diff --git a/.bc-linter.yml b/.bc-linter.yml new file mode 100644 index 0000000000000..cafa3a51c3ac1 --- /dev/null +++ b/.bc-linter.yml @@ -0,0 +1,15 @@ +version: 1 +paths: +include: + - "**/*.py" +exclude: + - ".*" + - ".*/**" + - "**/.*/**" + - "**/.*" + - "**/_*/**" + - "**/_*.py" + - "**/test/**" + - "**/benchmarks/**" + - "**/test_*.py" + - "**/*_test.py" diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index 424ddd0013cd8..bf8bab6dde232 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -3,8 +3,18 @@ set -eux -o pipefail GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} -if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then - export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +# Set CUDA architecture lists to match x86 build_cuda.sh +if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then + export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0" +elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then + export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0" +elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" +fi + +# Compress the fatbin with -compress-mode=size for CUDA 13 +if [[ "$DESIRED_CUDA" == *"13"* ]]; then + export TORCH_NVCC_FLAGS="-compress-mode=size" fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -18,7 +28,7 @@ cd / # on the mounted pytorch repo git config --global --add safe.directory /pytorch pip install -r /pytorch/requirements.txt -pip install auditwheel==6.2.0 +pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files @@ -26,6 +36,19 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 + + # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic) + if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + echo "Bundling CUDA libraries with wheel for aarch64." + else + echo "Using nvidia libs from pypi for aarch64." 
+ # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64 + # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"' + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}" + echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS" + export USE_NVIDIA_PYPI_LIBS=1 + fi + #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index d7bbdebc677ab..4bb9c64ea7772 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -69,61 +69,181 @@ def replace_tag(filename) -> None: f.writelines(lines) +def patch_library_rpath( + folder: str, + lib_name: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Apply patchelf to set RPATH for a library in torch/lib""" + lib_path = f"{folder}/tmp/torch/lib/{lib_name}" + + if use_nvidia_pypi_libs: + # For PyPI NVIDIA libraries, construct CUDA RPATH + cuda_rpaths = [ + "$ORIGIN/../../nvidia/cudnn/lib", + "$ORIGIN/../../nvidia/nvshmem/lib", + "$ORIGIN/../../nvidia/nccl/lib", + "$ORIGIN/../../nvidia/cusparselt/lib", + ] + + if "130" in desired_cuda: + cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib") + else: + cuda_rpaths.extend( + [ + "$ORIGIN/../../nvidia/cublas/lib", + "$ORIGIN/../../nvidia/cuda_cupti/lib", + "$ORIGIN/../../nvidia/cuda_nvrtc/lib", + "$ORIGIN/../../nvidia/cuda_runtime/lib", + "$ORIGIN/../../nvidia/cufft/lib", + "$ORIGIN/../../nvidia/curand/lib", + "$ORIGIN/../../nvidia/cusolver/lib", + "$ORIGIN/../../nvidia/cusparse/lib", + "$ORIGIN/../../nvidia/nvtx/lib", + "$ORIGIN/../../nvidia/cufile/lib", + ] + ) + + # Add $ORIGIN for local torch libs + rpath = ":".join(cuda_rpaths) + ":$ORIGIN" + else: + # For bundled libraries, just use $ORIGIN + rpath = "$ORIGIN" + + if os.path.exists(lib_path): + os.system( + f"cd {folder}/tmp/torch/lib/; " + f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}" + ) + + +def copy_and_patch_library( + src_path: str, + folder: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Copy a library to torch/lib and patch its RPATH""" + if os.path.exists(src_path): + lib_name = os.path.basename(src_path) + shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}") + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + + def package_cuda_wheel(wheel_path, desired_cuda) -> None: """ Package the cuda wheel libraries """ folder = os.path.dirname(wheel_path) - wheelname = os.path.basename(wheel_path) os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") - libs_to_copy = [ - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", - "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", - "/usr/local/cuda/lib64/libcudnn.so.9", - "/usr/local/cuda/lib64/libcublas.so.12", - "/usr/local/cuda/lib64/libcublasLt.so.12", - "/usr/local/cuda/lib64/libcudart.so.12", - "/usr/local/cuda/lib64/libcufft.so.11", - "/usr/local/cuda/lib64/libcusparse.so.12", - "/usr/local/cuda/lib64/libcusparseLt.so.0", - "/usr/local/cuda/lib64/libcusolver.so.11", - "/usr/local/cuda/lib64/libcurand.so.10", - "/usr/local/cuda/lib64/libnccl.so.2", - 
"/usr/local/cuda/lib64/libnvJitLink.so.12", - "/usr/local/cuda/lib64/libnvrtc.so.12", - "/usr/local/cuda/lib64/libcudnn_adv.so.9", - "/usr/local/cuda/lib64/libcudnn_cnn.so.9", - "/usr/local/cuda/lib64/libcudnn_graph.so.9", - "/usr/local/cuda/lib64/libcudnn_ops.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", - "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", - "/lib64/libgomp.so.1", - "/usr/lib64/libgfortran.so.5", - "/acl/build/libarm_compute.so", - "/acl/build/libarm_compute_graph.so", - "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_lapack_core.so.0", - "/usr/local/lib/libnvpl_blas_core.so.0", - ] - if "129" in desired_cuda: - libs_to_copy += [ - "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", + # Check if we should use PyPI NVIDIA libraries or bundle system libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + + if use_nvidia_pypi_libs: + print("Using nvidia libs from pypi - skipping CUDA library bundling") + # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages + # We only need to bundle non-NVIDIA libraries + minimal_libs_to_copy = [ + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + ] + + # Copy minimal libraries to unzipped_folder/torch/lib + for lib_path in minimal_libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) + + # Patch torch libraries used for searching libraries + torch_libs_to_patch = [ + "libtorch.so", + "libtorch_cpu.so", + "libtorch_cuda.so", + "libtorch_cuda_linalg.so", + "libtorch_global_deps.so", + "libtorch_python.so", + "libtorch_nvshmem.so", + "libc10.so", + "libc10_cuda.so", + "libcaffe2_nvrtc.so", + "libshm.so", + ] + for lib_name in torch_libs_to_patch: + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + else: + print("Bundling CUDA libraries with wheel") + # Original logic for bundling system CUDA libraries + # Common libraries for all CUDA versions + common_libs = [ + # Non-NVIDIA system libraries + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + # Common CUDA libraries (same for all versions) + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", + "/usr/local/cuda/lib64/libcudnn.so.9", + "/usr/local/cuda/lib64/libcusparseLt.so.0", + "/usr/local/cuda/lib64/libcurand.so.10", + "/usr/local/cuda/lib64/libnccl.so.2", + "/usr/local/cuda/lib64/libnvshmem_host.so.3", + "/usr/local/cuda/lib64/libcudnn_adv.so.9", + "/usr/local/cuda/lib64/libcudnn_cnn.so.9", + "/usr/local/cuda/lib64/libcudnn_graph.so.9", + "/usr/local/cuda/lib64/libcudnn_ops.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", + "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", "/usr/local/cuda/lib64/libcufile.so.0", "/usr/local/cuda/lib64/libcufile_rdma.so.1", + "/usr/local/cuda/lib64/libcusparse.so.12", 
] - # Copy libraries to unzipped_folder/a/lib - for lib_path in libs_to_copy: - lib_name = os.path.basename(lib_path) - shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}") - os.system( - f"cd {folder}/tmp/torch/lib/; " - f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" - ) + # CUDA version-specific libraries + if "130" in desired_cuda: + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13", + "/usr/local/cuda/lib64/libcublas.so.13", + "/usr/local/cuda/lib64/libcublasLt.so.13", + "/usr/local/cuda/lib64/libcudart.so.13", + "/usr/local/cuda/lib64/libcufft.so.12", + "/usr/local/cuda/lib64/libcusolver.so.12", + "/usr/local/cuda/lib64/libnvJitLink.so.13", + "/usr/local/cuda/lib64/libnvrtc.so.13", + "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0", + ] + elif "12" in desired_cuda: + # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9") + minor_version = desired_cuda[-1] + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", + "/usr/local/cuda/lib64/libcublas.so.12", + "/usr/local/cuda/lib64/libcublasLt.so.12", + "/usr/local/cuda/lib64/libcudart.so.12", + "/usr/local/cuda/lib64/libcufft.so.11", + "/usr/local/cuda/lib64/libcusolver.so.11", + "/usr/local/cuda/lib64/libnvJitLink.so.12", + "/usr/local/cuda/lib64/libnvrtc.so.12", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}", + ] + + # Combine all libraries + libs_to_copy = common_libs + version_specific_libs + + # Copy libraries to unzipped_folder/torch/lib + for lib_path in libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) # Make sure the wheel is tagged with manylinux_2_28 for f in os.scandir(f"{folder}/tmp/"): @@ -131,14 +251,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: replace_tag(f"{f.path}/WHEEL") break - os.mkdir(f"{folder}/cuda_wheel") - os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") - shutil.move( - f"{folder}/cuda_wheel/{wheelname}", - f"{folder}/{wheelname}", - copy_function=shutil.copy2, - ) - os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/") + os.system(f"wheel pack {folder}/tmp/ -d {folder}") + os.system(f"rm -rf {folder}/tmp/") def complete_wheel(folder: str) -> str: @@ -208,7 +322,17 @@ def parse_arguments(): build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: - build_vars = "MAX_JOBS=5 " + build_vars + build_vars += "MAX_JOBS=5 " + + # Handle PyPI NVIDIA libraries vs bundled libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + if use_nvidia_pypi_libs: + print("Configuring build for PyPI NVIDIA libraries") + # Configure for dynamic linking (matching x86 logic) + build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 " + else: + print("Configuring build for bundled NVIDIA libraries") + # Keep existing static linking approach - already configured above override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index 025d0a20579d4..7a4715d330060 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -438,9 +438,7 @@ def build_torchvision( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - 
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" @@ -495,9 +493,7 @@ def build_torchdata( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" @@ -553,9 +549,7 @@ def build_torchtext( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" @@ -613,9 +607,7 @@ def build_torchaudio( ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: - build_vars += ( - f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}" - ) + build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" diff --git a/.ci/docker/README.md b/.ci/docker/README.md index 26c97754faa70..5a97a0a3c2d46 100644 --- a/.ci/docker/README.md +++ b/.ci/docker/README.md @@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`: ```bash docker build \ - .... - --build-arg "NEW_ARG_1=${NEW_ARG_1}" + .... + --build-arg "NEW_ARG_1=${NEW_ARG_1}" ``` 3. 
**Update Dockerfile logic**: diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index 418a76ceac234..481d21b96cfe9 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -64,6 +64,10 @@ FROM cuda as cuda12.9 RUN bash ./install_cuda.sh 12.9 ENV DESIRED_CUDA=12.9 +FROM cuda as cuda13.0 +RUN bash ./install_cuda.sh 13.0 +ENV DESIRED_CUDA=13.0 + FROM ${ROCM_IMAGE} as rocm ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" ADD ./common/install_mkl.sh install_mkl.sh @@ -76,10 +80,10 @@ ADD ./common/install_mnist.sh install_mnist.sh RUN bash ./install_mnist.sh FROM base as all_cuda -COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8 COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 +COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0 # Final step FROM ${BASE_TARGET} as final diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 689d6f43b8e98..48be0cf538054 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -76,10 +76,13 @@ elif [[ "$image" == *cuda*linter* ]]; then elif [[ "$image" == *linter* ]]; then # Use a separate Dockerfile for linter to keep a small image size DOCKERFILE="linter/Dockerfile" +elif [[ "$image" == *riscv* ]]; then + # Use RISC-V specific Dockerfile + DOCKERFILE="ubuntu-cross-riscv/Dockerfile" fi -_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb -_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b +_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152 +_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d @@ -111,41 +114,18 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} TRITON=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) + CUDA_VERSION=13.0.0 ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=9 + GCC_VERSION=11 VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes - INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.13 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) - CUDA_VERSION=12.6.3 ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes @@ -153,6 +133,7 @@ case "$tag" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes + INDUCTOR_BENCHMARKS=yes ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm) CUDA_VERSION=12.8.1 @@ -164,39 +145,6 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} TRITON=yes ;; - pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) - CUDA_VERSION=12.6 - ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - 
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) - CUDA_VERSION=12.6 - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) - CUDA_VERSION=12.6 - ANACONDA_PYTHON_VERSION=3.13 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) CUDA_VERSION=12.8.1 ANACONDA_PYTHON_VERSION=3.10 @@ -208,30 +156,18 @@ case "$tag" in TRITON=yes ;; pytorch-linux-jammy-py3-clang12-onnx) - ANACONDA_PYTHON_VERSION=3.9 + ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 VISION=yes ONNX=yes ;; - pytorch-linux-jammy-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 - CLANG_VERSION=12 - VISION=yes - TRITON=yes - ;; - pytorch-linux-jammy-py3.11-clang12) - ANACONDA_PYTHON_VERSION=3.11 + pytorch-linux-jammy-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 VISION=yes TRITON=yes ;; - pytorch-linux-jammy-py3.9-gcc9) - ANACONDA_PYTHON_VERSION=3.9 - GCC_VERSION=9 - VISION=yes - TRITON=yes - ;; - pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3) + pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3) if [[ $tag =~ "jammy" ]]; then ANACONDA_PYTHON_VERSION=3.10 else @@ -245,7 +181,9 @@ case "$tag" in KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} - INDUCTOR_BENCHMARKS=yes + if [[ $tag =~ "benchmarks" ]]; then + INDUCTOR_BENCHMARKS=yes + fi ;; pytorch-linux-noble-rocm-alpha-py3) ANACONDA_PYTHON_VERSION=3.12 @@ -257,26 +195,26 @@ case "$tag" in KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} - INDUCTOR_BENCHMARKS=yes PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950" ;; - pytorch-linux-jammy-xpu-2025.0-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.0 + XPU_VERSION=2025.1 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-xpu-2025.1-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.1 + XPU_VERSION=2025.2 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) + # TODO (huydhn): Upgrade this to Python >= 3.10 ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 VISION=yes @@ -285,8 +223,8 @@ case "$tag" in DOCS=yes INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 CLANG_VERSION=12 VISION=yes @@ -297,8 +235,8 @@ case "$tag" in CLANG_VERSION=18 VISION=yes ;; - pytorch-linux-jammy-py3.9-gcc11) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3.10-gcc11) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -339,7 +277,6 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes - CONDA_CMAKE=yes OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific @@ -350,13 +287,15 @@ case "$tag" in GCC_VERSION=11 ACL=yes VISION=yes - CONDA_CMAKE=yes OPENBLAS=yes # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific 
SKIP_LLVM_SRC_BUILD_INSTALL=yes INDUCTOR_BENCHMARKS=yes ;; + pytorch-linux-noble-riscv64-py3.12-gcc14) + GCC_VERSION=14 + ;; *) # Catch-all for builds that are not hardcoded. VISION=yes @@ -481,7 +420,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then fi if [ -n "$GCC_VERSION" ]; then - if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then + if [[ "$image" == *riscv* ]]; then + # Check RISC-V cross-compilation toolchain version + if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then + echo "RISC-V GCC_VERSION=$GCC_VERSION, but:" + drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version + exit 1 + fi + elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then echo "GCC_VERSION=$GCC_VERSION, but:" drun gcc --version exit 1 diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt new file mode 100644 index 0000000000000..66e5dbdfb1bb1 --- /dev/null +++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt @@ -0,0 +1,2 @@ +transformers==4.54.0 +soxr==0.5.0 diff --git a/.ci/docker/ci_commit_pins/huggingface.txt b/.ci/docker/ci_commit_pins/huggingface.txt deleted file mode 100644 index f00d6ca4f9ca7..0000000000000 --- a/.ci/docker/ci_commit_pins/huggingface.txt +++ /dev/null @@ -1 +0,0 @@ -243e186efbf7fb93328dd6b34927a4e8c8f24395 diff --git a/.ci/docker/ci_commit_pins/nccl-cu13.txt b/.ci/docker/ci_commit_pins/nccl-cu13.txt new file mode 100644 index 0000000000000..77202c1566019 --- /dev/null +++ b/.ci/docker/ci_commit_pins/nccl-cu13.txt @@ -0,0 +1 @@ +v2.27.7-1 diff --git a/.ci/docker/ci_commit_pins/torchbench.txt b/.ci/docker/ci_commit_pins/torchbench.txt new file mode 100644 index 0000000000000..c9be7b440baea --- /dev/null +++ b/.ci/docker/ci_commit_pins/torchbench.txt @@ -0,0 +1 @@ +74a23feff57432129df84d8099e622773cf77925 diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index 80d7d7ed18af9..b03606f6defc1 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1 @@ -ae324eeac8e102a2b40370e341460f3791353398 +1b0418a9a454b2b93ab8d71f40e59d2297157fae diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index d7fc6ea264ddb..692edd0b898f1 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -66,8 +66,9 @@ function do_cpython_build { ln -s pip3 ${prefix}/bin/pip fi # install setuptools since python 3.12 is required to use distutils - ${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0 - local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))") + # packaging is needed to create symlink since wheel no longer provides needed information + ${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0 + local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))") ln -sf ${prefix} /opt/python/${abi_tag} } @@ -82,9 +83,9 @@ function build_cpython { py_suffix=${py_ver::-1} py_folder=$py_suffix fi - # Only b3 is available now + # Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4 if [ "$py_suffix" 
== "3.14.0" ]; then - py_suffix="3.14.0b3" + py_suffix="3.14.0rc2" fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index c8a780f65c8e5..c6808ea4a7a26 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -10,7 +10,7 @@ else arch_path='sbsa' fi -NVSHMEM_VERSION=3.3.9 +NVSHMEM_VERSION=3.3.24 function install_cuda { version=$1 @@ -62,14 +62,16 @@ function install_nvshmem { mkdir -p "${tmpdir}" && cd "${tmpdir}" # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html - filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}" - url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz" + # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver + filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive" + suffix=".tar.xz" + url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}" # download, unpack, install wget -q "${url}" - tar xf "${filename}.tar.gz" - cp -a "libnvshmem/include/"* /usr/local/include/ - cp -a "libnvshmem/lib/"* /usr/local/lib/ + tar xf "${filename}${suffix}" + cp -a "${filename}/include/"* /usr/local/cuda/include/ + cp -a "${filename}/lib/"* /usr/local/cuda/lib64/ # cleanup cd .. @@ -126,74 +128,6 @@ function install_129 { ldconfig } -function prune_124 { - echo "Pruning CUDA 12.4" - ##################################################################################### - # CUDA 12.4 prune static libs - ##################################################################################### - export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune" - export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64" - - export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - - if [[ -n "$OVERRIDE_GENCODE" ]]; then - export GENCODE=$OVERRIDE_GENCODE - fi - if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then - export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN - fi - - # all CUDA libs except CuDNN and CuBLAS - ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ - | xargs -I {} bash -c \ - "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" - - # prune CuDNN and CuBLAS - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a - - ##################################################################################### - # CUDA 12.4 prune visual tools - ##################################################################################### - export CUDA_BASE="/usr/local/cuda-12.4/" - rm -rf $CUDA_BASE/libnvvp 
$CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/ -} - -function prune_126 { - echo "Pruning CUDA 12.6" - ##################################################################################### - # CUDA 12.6 prune static libs - ##################################################################################### - export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune" - export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64" - - export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" - - if [[ -n "$OVERRIDE_GENCODE" ]]; then - export GENCODE=$OVERRIDE_GENCODE - fi - if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then - export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN - fi - - # all CUDA libs except CuDNN and CuBLAS - ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ - | xargs -I {} bash -c \ - "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" - - # prune CuDNN and CuBLAS - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a - $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a - - ##################################################################################### - # CUDA 12.6 prune visual tools - ##################################################################################### - export CUDA_BASE="/usr/local/cuda-12.6/" - rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/ -} - function install_128 { CUDNN_VERSION=9.8.0.87 echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" @@ -212,18 +146,38 @@ function install_128 { ldconfig } +function install_130 { + CUDNN_VERSION=9.13.0.50 + echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" + # install CUDA 13.0 in the same container + install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + install_cudnn 13 $CUDNN_VERSION + + install_nvshmem 13 $NVSHMEM_VERSION + + CUDA_VERSION=13.0 bash install_nccl.sh + + CUDA_VERSION=13.0 bash install_cusparselt.sh + + ldconfig +} + # idiomatic parameter and option handling in sh while test $# -gt 0 do case "$1" in - 12.4) install_124; prune_124 + 12.4) install_124; ;; - 12.6|12.6.*) install_126; prune_126 + 12.6|12.6.*) install_126; ;; 12.8|12.8.*) install_128; ;; 12.9|12.9.*) install_129; ;; + 13.0|13.0.*) install_130; + ;; *) echo "bad argument $1"; exit 1 ;; esac diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index feacb49f39eb5..b532c086371f1 100644 --- a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,7 +5,15 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt -if [[ ${CUDA_VERSION:0:4} =~ 
^12\.[5-9]$ ]]; then +if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then + arch_path='sbsa' + export TARGETARCH=${TARGETARCH:-$(uname -m)} + if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then + arch_path='x86_64' + fi + CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then diff --git a/.ci/docker/common/install_inductor_benchmark_deps.sh b/.ci/docker/common/install_inductor_benchmark_deps.sh index 7312dce170db2..81467d87f5140 100644 --- a/.ci/docker/common/install_inductor_benchmark_deps.sh +++ b/.ci/docker/common/install_inductor_benchmark_deps.sh @@ -5,9 +5,7 @@ set -ex source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" function install_huggingface() { - local version - commit=$(get_pinned_commit huggingface) - pip_install "git+https://github.com/huggingface/transformers@${commit}" + pip_install -r huggingface-requirements.txt } function install_timm() { @@ -15,11 +13,34 @@ function install_timm() { commit=$(get_pinned_commit timm) pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" - # Clean up - conda_run pip uninstall -y torch torchvision triton +} + +function install_torchbench() { + local commit + commit=$(get_pinned_commit torchbench) + git clone https://github.com/pytorch/benchmark torchbench + pushd torchbench + git checkout "$commit" + + python install.py --continue_on_fail + + echo "Print all dependencies after TorchBench is installed" + python -mpip freeze + popd + + chown -R jenkins torchbench + chown -R jenkins /opt/conda } # Pango is needed for weasyprint which is needed for doctr conda_install pango + +# Stable packages are ok here, just to satisfy TorchBench check +pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + +install_torchbench install_huggingface install_timm + +# Clean up +conda_run pip uninstall -y torch torchvision torchaudio triton torchao diff --git a/.ci/docker/common/install_nccl.sh b/.ci/docker/common/install_nccl.sh index 17d80ebe7d273..58a8e0b4e49c1 100644 --- a/.ci/docker/common/install_nccl.sh +++ b/.ci/docker/common/install_nccl.sh @@ -7,6 +7,8 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt) elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt) +elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then + NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt) else echo "Unexpected CUDA_VERSION ${CUDA_VERSION}" exit 1 diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index d07ec32001635..9f23feb5adfaf 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -19,8 +19,8 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging -pip_install onnxruntime==1.18.1 -pip_install onnxscript==0.3.1 +pip_install onnxruntime==1.22.1 +pip_install onnxscript==0.4.0 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. 
By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index 726dfd1c74cfa..8e714bcb6cd32 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then cd python fi -pip_install pybind11==2.13.6 +pip_install pybind11==3.0.1 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index b7f884ea9648f..04f15a52e88e3 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -44,8 +44,12 @@ function install_ucc() { ./autogen.sh - # We only run distributed tests on Tesla M60 and A10G - NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then + NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86" + else + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + fi if [[ -n "$ROCM_VERSION" ]]; then if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index ecbbb8ccccf89..0b150872f93ce 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -34,18 +34,27 @@ function install_ubuntu() { # The xpu-smi packages apt-get install -y flex bison xpu-smi - # Compute and Media Runtimes - apt-get install -y \ - intel-opencl-icd intel-level-zero-gpu level-zero \ - intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ - libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ - libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ - mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo - if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then - apt-get install -y intel-ocloc + + if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then + # Compute and Media Runtimes + apt-get install -y \ + intel-opencl-icd intel-level-zero-gpu level-zero \ + intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo + # Development Packages + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev + else # rolling driver + apt-get install -y \ + intel-opencl-icd libze-intel-gpu1 libze1 \ + intel-media-va-driver-non-free libmfx-gen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc + apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev fi - # Development Packages - apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev + # Install Intel Support Packages apt-get install -y ${XPU_PACKAGES} @@ -56,10 +65,14 @@ function install_ubuntu() { function 
install_rhel() { . /etc/os-release - - if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then - echo "RHEL version ${VERSION_ID} not supported" - exit + if [[ "${ID}" == "rhel" ]]; then + if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then + echo "RHEL version ${VERSION_ID} not supported" + exit + fi + elif [[ "${ID}" == "almalinux" ]]; then + # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64 + VERSION_ID="8.8" fi dnf install -y 'dnf-command(config-manager)' @@ -130,18 +143,18 @@ function install_sles() { } -# Default use GPU driver LTS releases -XPU_DRIVER_VERSION="/lts/2350" -if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then - # Use GPU driver rolling releases - XPU_DRIVER_VERSION="" +# Default use GPU driver rolling releases +XPU_DRIVER_VERSION="" +if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then + # Use GPU driver LTS releases + XPU_DRIVER_VERSION="/lts/2350" fi -# Default use Intel® oneAPI Deep Learning Essentials 2025.0 -if [[ "$XPU_VERSION" == "2025.1" ]]; then - XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +# Default use Intel® oneAPI Deep Learning Essentials 2025.1 +if [[ "$XPU_VERSION" == "2025.2" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.2" else - XPU_PACKAGES="intel-deep-learning-essentials-2025.0" + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" fi # The installation depends on the base OS diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index a99a39d776267..c93f022268b25 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -69,6 +69,19 @@ RUN bash ./install_cuda.sh 12.9 RUN bash ./install_magma.sh 12.9 RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda +FROM cuda as cuda13.0 +RUN bash ./install_cuda.sh 13.0 +RUN bash ./install_magma.sh 13.0 +RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda + +# Install libibverbs for libtorch and copy to CUDA directory +RUN apt-get update -y && \ + apt-get install -y libibverbs-dev librdmacm-dev && \ + cp /usr/lib/x86_64-linux-gnu/libmlx5.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/librdmacm.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libibverbs.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/ + FROM cpu as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index baee261d6ff65..4c5347b40629c 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh -ENV XPU_VERSION 2025.1 +ENV XPU_VERSION 2025.2 RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index abe47bbe9188c..5dee4325857fb 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -67,6 +67,12 @@ case ${image} in DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" MANY_LINUX_VERSION="2_28" ;; + manylinux2_28-builder:cuda13*) + TARGET=cuda_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="2_28" + ;; 
manylinuxaarch64-builder:cuda*) TARGET=cuda_final GPU_IMAGE=amd64/almalinux:8 diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 12c2f5678c5a5..45fef66fd567f 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -63,11 +63,12 @@ lark==0.12.0 #Pinned versions: 0.12.0 #test that import: -librosa>=0.6.2 ; python_version < "3.11" -librosa==0.10.2 ; python_version == "3.12" +librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x" +librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 #test that import: test_spectral_ops.py +#librosa depends on numba; disable it for s390x while numba is disabled too #mkl #this breaks linux-bionic-rocm4.5-py3.7 #Description: Intel oneAPI Math Kernel Library @@ -116,6 +117,7 @@ numba==0.61.2 ; python_version > "3.9" #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py #For numba issue see https://github.com/pytorch/pytorch/issues/51511 +#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 #numpy #Description: Provides N-dimensional arrays and linear algebra @@ -299,7 +301,7 @@ pytest-cpp==2.3.0 #Pinned versions: 2.3.0 #test that import: -z3-solver==4.15.1.0 +z3-solver==4.15.1.0 ; platform_machine != "s390x" #Description: The Z3 Theorem Prover Project #Pinned versions: #test that import: @@ -335,7 +337,7 @@ onnx==1.18.0 #Pinned versions: #test that import: -onnxscript==0.3.1 +onnxscript==0.4.0 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -354,7 +356,6 @@ pwlf==2.2.1 #Pinned versions: 2.2.1 #test that import: test_sac_estimator.py - # To build PyTorch itself pyyaml pyzstd @@ -376,7 +377,7 @@ dataclasses_json==0.6.7 cmake==4.0.0 #Description: required for building -tlparse==0.3.30 +tlparse==0.4.0 #Description: required for log parsing cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 3de4d8e0e44ec..efe6fb4c949b0 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. 
The initial thought that it is probably diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt index 18091983f59dd..1545d966571dc 100644 --- a/.ci/docker/triton_xpu_version.txt +++ b/.ci/docker/triton_xpu_version.txt @@ -1 +1 @@ -3.4.0 +3.5.0 diff --git a/.ci/docker/ubuntu-cross-riscv/Dockerfile b/.ci/docker/ubuntu-cross-riscv/Dockerfile new file mode 100644 index 0000000000000..08201dc83216c --- /dev/null +++ b/.ci/docker/ubuntu-cross-riscv/Dockerfile @@ -0,0 +1,155 @@ +# Cross-compilation Docker container for RISC-V architecture +ARG UBUNTU_VERSION +FROM --platform=linux/amd64 ubuntu:${UBUNTU_VERSION} as base + +ARG UBUNTU_VERSION + +ENV GCC_VERSION=14 +ENV PYTHON_VERSION=3.12.3 +ENV DEBIAN_FRONTEND=noninteractive +ENV CC=riscv64-linux-gnu-gcc-${GCC_VERSION} +ENV CXX=riscv64-linux-gnu-g++-${GCC_VERSION} +ENV QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/ +ENV SYSROOT=/opt/sysroot + +# Install basic dependencies +RUN apt-get update && apt-get install -y \ + ninja-build \ + autoconf \ + automake \ + libtool \ + patchelf \ + ccache \ + git \ + wget \ + python3-pip \ + python3-venv \ + python-is-python3 \ + cmake \ + sudo \ + lsb-release \ + gcc-${GCC_VERSION}-riscv64-linux-gnu \ + g++-${GCC_VERSION}-riscv64-linux-gnu \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +# Install user +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + +FROM base as python +ARG ZLIB_VERSION=1.3.1 +ARG FFI_VERSION=3.4.6 +ARG BZ2_VERSION=1.0.8 +ARG XZ_VERSION=5.4.6 +ARG OPENSSL_VERSION=3.2.1 + +# Set up sysroot directory for dependencies +ENV PKG_CONFIG_PATH=${SYSROOT}/lib/pkgconfig +ENV PKG_CONFIG_SYSROOT_DIR=${SYSROOT} + +WORKDIR /opt + +# Build zlib (for compression) +RUN echo "--- Building zlib ---" \ + && wget -c https://www.zlib.net/zlib-${ZLIB_VERSION}.tar.gz \ + && tar -xf zlib-${ZLIB_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd zlib-${ZLIB_VERSION}/ \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} \ + && make -j$(nproc) && make install \ + && cd ../.. + +# Build libffi (for ctypes module) +RUN echo "--- Building libffi ---" \ + && wget -c https://github.com/libffi/libffi/releases/download/v${FFI_VERSION}/libffi-${FFI_VERSION}.tar.gz \ + && tar -xf libffi-${FFI_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd libffi-${FFI_VERSION}/ \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \ + && make -j$(nproc) && make install \ + && cd ../.. 
+ +# Build bzip2 (for bz2 module) +RUN echo "--- Building bzip2 ---" \ + && wget -c https://sourceware.org/pub/bzip2/bzip2-${BZ2_VERSION}.tar.gz \ + && tar -xf bzip2-${BZ2_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd bzip2-${BZ2_VERSION}/ \ + && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} bzip2 bzip2recover libbz2.a \ + && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} -f Makefile-libbz2_so \ + && make install PREFIX=${SYSROOT} \ + && cp libbz2.so.${BZ2_VERSION} ${SYSROOT}/lib/ \ + && cd ${SYSROOT}/lib/ \ + && ln -sf libbz2.so.${BZ2_VERSION} libbz2.so.1.0 \ + && ln -sf libbz2.so.1.0 libbz2.so \ + && cd /opt/ + +# Build xz (for lzma module) +RUN echo "--- Building xz ---" \ + && wget -c https://github.com/tukaani-project/xz/releases/download/v${XZ_VERSION}/xz-${XZ_VERSION}.tar.gz \ + && tar -xf xz-${XZ_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd xz-${XZ_VERSION} \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \ + && make -j$(nproc) && make install \ + && cd ../.. + +# Build OpenSSL (for ssl module) +RUN echo "--- Building OpenSSL ---" \ + && wget -c https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \ + && tar -xf openssl-${OPENSSL_VERSION}.tar.gz --no-same-permissions --no-same-owner \ + && cd openssl-${OPENSSL_VERSION}/ \ + && mkdir build && cd build \ + && ../Configure linux64-riscv64 --prefix=${SYSROOT} \ + && make -j$(nproc) && make install_sw \ + && cd ../.. + +# Build SQLite3 (for sqlite3 module) +RUN echo "--- Building SQLite3 ---" \ + && wget -c https://www.sqlite.org/2024/sqlite-autoconf-3450200.tar.gz \ + && tar -xf sqlite-autoconf-3450200.tar.gz --no-same-permissions --no-same-owner \ + && cd sqlite-autoconf-3450200 \ + && mkdir build && cd build \ + && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \ + && make -j$(nproc) && make install \ + && cd ../.. 
+ +# Build and install RISC-V Python with all modules +RUN wget -c https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ + && tar -xf Python-${PYTHON_VERSION}.tgz --no-same-permissions --no-same-owner \ + && cd Python-${PYTHON_VERSION} \ + && mkdir build && cd build \ + && ../configure \ + --host=riscv64-linux-gnu \ + --build=x86_64-linux-gnu \ + --prefix=${SYSROOT} \ + --enable-shared \ + --disable-ipv6 \ + --with-build-python=/usr/bin/python3 \ + --with-ensurepip=no \ + ac_cv_file__dev_ptmx=yes \ + ac_cv_file__dev_ptc=no \ + && make -j$(nproc) \ + && make install + +FROM base as final +COPY --from=python /opt/sysroot /opt/sysroot + +# Install crossenv and cmake +RUN pip install crossenv cmake==4.0.0 --break-system-packages \ + && /usr/bin/python3 -m crossenv ${SYSROOT}/bin/python3 /opt/riscv-cross-env + +# Add pip-installed cmake binaries to PATH +ENV PATH="/usr/local/bin:${PATH}" + +# Set up cross Python environment +SHELL ["/bin/bash", "-c"] +RUN source /opt/riscv-cross-env/bin/activate \ + && pip install setuptools pyyaml typing_extensions wheel + +# Set default environment variables for PyTorch build +ENV Python_ROOT_DIR=${SYSROOT} +ENV OPENSSL_ROOT_DIR=${SYSROOT} + +USER jenkins +CMD ["bash"] diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index 883248b884ed8..e5b672cc8e37f 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -96,10 +96,11 @@ ARG ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh -COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt +COPY ci_commit_pins/torchbench.txt torchbench.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi -RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt # (optional) Install non-default Ninja version ARG NINJA_VERSION diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index a0e7dce3df4d5..8765249688ce5 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -56,10 +56,10 @@ RUN rm install_openssl.sh ARG INDUCTOR_BENCHMARKS COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh COPY ./common/common_utils.sh common_utils.sh -COPY ci_commit_pins/huggingface.txt huggingface.txt +COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi -RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt # Install XPU Dependencies ARG XPU_VERSION diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 27c466dd8d41d..1edc8c60c2f07 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -66,6 +66,7 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" # (optional) Install UCC ARG UCX_COMMIT ARG UCC_COMMIT +ARG CUDA_VERSION ENV UCX_COMMIT $UCX_COMMIT ENV UCC_COMMIT $UCC_COMMIT ENV UCX_HOME /usr @@ -96,10 +97,11 @@ 
RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface.txt huggingface.txt
+COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
+COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

ARG TRITON
ARG TRITON_CPU
@@ -180,7 +182,6 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

# AWS specific CUDA build guidance
-ENV TORCH_CUDA_ARCH_LIST Maxwell
ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
ENV CUDA_PATH /usr/local/cuda
diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh
index e822feb2674d9..54ddd905aad05 100644
--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@@ -7,4 +7,4 @@ set -ex

SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
+USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
diff --git a/.ci/lumen_cli/README.md b/.ci/lumen_cli/README.md
new file mode 100644
index 0000000000000..a0bb8b19a000f
--- /dev/null
+++ b/.ci/lumen_cli/README.md
@@ -0,0 +1,31 @@
+# 🔧 Lumen_cli
+A Python CLI tool for building and testing PyTorch-based components, using a YAML configuration file for structured, repeatable workflows.
+
+
+## Features
+- **Build**
+  - external projects (e.g. vLLM)
+
+## 📦 Installation
+At the root of the PyTorch repo:
+```bash
+pip install -e .ci/lumen_cli
+```
+
+## Run the CLI tool
+The CLI tool must be run from the root of the PyTorch repo. For example, to build external vLLM:
+```bash
+python -m cli.run build external vllm
+```
+This runs the build steps with the default behaviour for the vLLM project.
+
+To see the help messages, run:
+```bash
+python3 -m cli.run --help
+```
+
+## Add customized external build logic
+To add a new external build target:
+1. Create the build function in the `cli/lib` folder.
+2. Register your target and the main build function in `EXTERNAL_BUILD_TARGET_DISPATCH` in `cli/build_cli/register_build.py`.
+3. 
[optional] create your ci config file in .github/ci_configs/${EXTERNAL_PACKAGE_NAME}.yaml diff --git a/test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_blocked b/.ci/lumen_cli/cli/build_cli/__init__.py similarity index 100% rename from test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_blocked rename to .ci/lumen_cli/cli/build_cli/__init__.py diff --git a/.ci/lumen_cli/cli/build_cli/register_build.py b/.ci/lumen_cli/cli/build_cli/register_build.py new file mode 100644 index 0000000000000..9f35a9c8165dc --- /dev/null +++ b/.ci/lumen_cli/cli/build_cli/register_build.py @@ -0,0 +1,37 @@ +import argparse +import logging + +from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec +from cli.lib.core.vllm.vllm_build import VllmBuildRunner + + +logger = logging.getLogger(__name__) + +# Maps targets to their argparse configuration and runner +# it adds new target to path python -m cli.run build external {target} with buildrunner +_TARGETS: dict[str, TargetSpec] = { + "vllm": { + "runner": VllmBuildRunner, + "help": "Build vLLM using docker buildx.", + } + # add yours ... +} + + +def register_build_commands(subparsers: argparse._SubParsersAction) -> None: + build_parser = subparsers.add_parser( + "build", + help="Build related commands", + formatter_class=RichHelp, + ) + build_subparsers = build_parser.add_subparsers(dest="build_command", required=True) + overview = "\n".join( + f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items() + ) + external_parser = build_subparsers.add_parser( + "external", + help="Build external targets", + description="Build third-party targets.\n\nAvailable targets:\n" + overview, + formatter_class=RichHelp, + ) + register_targets(external_parser, _TARGETS) diff --git a/test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_bool_called_at_least_once b/.ci/lumen_cli/cli/lib/__init__.py similarity index 100% rename from test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_bool_called_at_least_once rename to .ci/lumen_cli/cli/lib/__init__.py diff --git a/.ci/lumen_cli/cli/lib/common/cli_helper.py b/.ci/lumen_cli/cli/lib/common/cli_helper.py new file mode 100644 index 0000000000000..927ca09fe7230 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py @@ -0,0 +1,71 @@ +""" +Cli Argparser Utility helpers for CLI tasks. 
+ +""" + +import argparse +from abc import ABC, abstractmethod + + +try: + from typing import Any, Callable, Required, TypedDict # Python 3.11+ +except ImportError: + from typing import Any, Callable, TypedDict + + from typing_extensions import Required # Fallback for Python <3.11 + + +class BaseRunner(ABC): + def __init__(self, args: Any) -> None: + self.args = args + + @abstractmethod + def run(self) -> None: + """runs main logics, required""" + + +# Pretty help: keep newlines + show defaults +class RichHelp( + argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter +): + pass + + +class TargetSpec(TypedDict, total=False): + """CLI subcommand specification with bA.""" + + runner: Required[type[BaseRunner]] + help: str + description: str + add_arguments: Callable[[argparse.ArgumentParser], None] + + +def register_targets( + parser: argparse.ArgumentParser, + target_specs: dict[str, TargetSpec], + common_args: Callable[[argparse.ArgumentParser], None] = lambda _: None, +) -> None: + """Register target subcommands.""" + targets = parser.add_subparsers( + dest="target", + required=True, + metavar="{" + ",".join(target_specs.keys()) + "}", + ) + + for name, spec in target_specs.items(): + desc = spec.get("description") or spec["runner"].__doc__ or "" + + p = targets.add_parser( + name, + help=spec.get("help", ""), + description=desc.strip(), + formatter_class=RichHelp, + ) + p.set_defaults( + func=lambda args, cls=spec["runner"]: cls(args).run(), + _runner_class=spec["runner"], + ) + if "add_arguments" in spec and callable(spec["add_arguments"]): + spec["add_arguments"](p) + if common_args: + common_args(p) diff --git a/.ci/lumen_cli/cli/lib/common/docker_helper.py b/.ci/lumen_cli/cli/lib/common/docker_helper.py new file mode 100644 index 0000000000000..b5f0a90e2d47a --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/docker_helper.py @@ -0,0 +1,42 @@ +""" +Docker Utility helpers for CLI tasks. +""" + +import logging +from typing import Optional + +import docker +from docker.errors import APIError, NotFound + + +logger = logging.getLogger(__name__) + +# lazy singleton so we don't reconnect every call +_docker_client: Optional[docker.DockerClient] = None + + +def _get_client() -> docker.DockerClient: + global _docker_client + if _docker_client is None: + _docker_client = docker.from_env() + return _docker_client + + +def local_image_exists( + image_name: str, client: Optional[docker.DockerClient] = None +) -> bool: + """Return True if a local Docker image exists.""" + if not image_name: + return False + + client = client or _get_client() + try: + client.images.get(image_name) + return True + except (NotFound, APIError) as e: + logger.error( + "Error when checking Docker image '%s': %s", + image_name, + e.explanation if hasattr(e, "explanation") else str(e), + ) + return False diff --git a/.ci/lumen_cli/cli/lib/common/envs_helper.py b/.ci/lumen_cli/cli/lib/common/envs_helper.py new file mode 100644 index 0000000000000..a654e7f18ed9f --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/envs_helper.py @@ -0,0 +1,110 @@ +""" +Environment Variables and Dataclasses Utility helpers for CLI tasks. 
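To make the dispatch in `register_targets` above concrete, here is a minimal, self-contained sketch of how a runner gets wired into an argparse parser and then invoked through the `func` default. `HelloRunner`, `add_hello_args`, and the `hello` target are illustrative only:

```python
# Illustrative sketch; HelloRunner and the "hello" target are not part of this PR.
import argparse

from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec


class HelloRunner(BaseRunner):
    """Say hello."""

    def run(self) -> None:
        print(f"hello, {self.args.name}")


def add_hello_args(p: argparse.ArgumentParser) -> None:
    p.add_argument("--name", default="world", help="who to greet")


parser = argparse.ArgumentParser(prog="demo", formatter_class=RichHelp)
specs: dict[str, TargetSpec] = {
    "hello": {"runner": HelloRunner, "add_arguments": add_hello_args},
}
register_targets(parser, specs)

ns = parser.parse_args(["hello", "--name", "ci"])
ns.func(ns)  # constructs HelloRunner(ns) and calls its run() -> prints "hello, ci"
```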
+""" + +import os +from dataclasses import field, fields, is_dataclass, MISSING +from pathlib import Path +from textwrap import indent +from typing import Optional, Union + +from cli.lib.common.utils import str2bool + + +def get_env(name: str, default: str = "") -> str: + """Get environment variable with default fallback.""" + return os.environ.get(name) or default + + +def env_path_optional( + name: str, + default: Optional[Union[str, Path]] = None, + resolve: bool = True, +) -> Optional[Path]: + """Get environment variable as optional Path.""" + val = get_env(name) or default + if not val: + return None + + path = Path(val) + return path.resolve() if resolve else path + + +def env_path( + name: str, + default: Optional[Union[str, Path]] = None, + resolve: bool = True, +) -> Path: + """Get environment variable as Path, raise if missing.""" + path = env_path_optional(name, default, resolve) + if not path: + raise ValueError(f"Missing path value for {name}") + return path + + +def env_bool( + name: str, + default: bool = False, +) -> bool: + val = get_env(name) + if not val: + return default + return str2bool(val) + + +def env_bool_field( + name: str, + default: bool = False, +): + return field(default_factory=lambda: env_bool(name, default)) + + +def env_path_field( + name: str, + default: Union[str, Path] = "", + *, + resolve: bool = True, +) -> Path: + return field(default_factory=lambda: env_path(name, default, resolve=resolve)) + + +def env_str_field( + name: str, + default: str = "", +) -> str: + return field(default_factory=lambda: get_env(name, default)) + + +def generate_dataclass_help(cls) -> str: + """Auto-generate help text for dataclass fields.""" + if not is_dataclass(cls): + raise TypeError(f"{cls} is not a dataclass") + + def get_value(f): + if f.default is not MISSING: + return f.default + if f.default_factory is not MISSING: + try: + return f.default_factory() + except Exception as e: + return f"" + return "" + + lines = [f"{f.name:<22} = {repr(get_value(f))}" for f in fields(cls)] + return indent("\n".join(lines), " ") + + +def with_params_help(params_cls: type, title: str = "Parameter defaults"): + """ + Class decorator that appends a help table generated from another dataclass + (e.g., VllmParameters) to the decorated class's docstring. 
+ """ + if not is_dataclass(params_cls): + raise TypeError(f"{params_cls} must be a dataclass") + + def _decorator(cls: type) -> type: + block = generate_dataclass_help(params_cls) + cls.__doc__ = (cls.__doc__ or "") + f"\n\n{title}:\n{block}" + return cls + + return _decorator diff --git a/.ci/lumen_cli/cli/lib/common/gh_summary.py b/.ci/lumen_cli/cli/lib/common/gh_summary.py new file mode 100644 index 0000000000000..72bfaa76e7068 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/gh_summary.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import logging +import os +import textwrap +from pathlib import Path +from typing import TYPE_CHECKING + +from cli.lib.common.utils import get_wheels +from jinja2 import Template + + +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping + + +logger = logging.getLogger(__name__) + +_TPL_CONTENT = Template( + textwrap.dedent("""\ + ## {{ title }} + + ```{{ lang }} + {{ content }} + ``` +""") +) + +_TPL_LIST_ITEMS = Template( + textwrap.dedent("""\ + ## {{ title }} + {% for it in items %} + - {{ it.pkg }}: {{ it.relpath }} + {% else %} + _(no item found)_ + {% endfor %} + """) +) + +_TPL_TABLE = Template( + textwrap.dedent("""\ + {%- if rows %} + | {{ cols | join(' | ') }} | + |{%- for _ in cols %} --- |{%- endfor %} + {%- for r in rows %} + | {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %} + {%- endfor %} + {%- else %} + _(no data)_ + {%- endif %} +""") +) + + +def gh_summary_path() -> Path | None: + """Return the Path to the GitHub step summary file, or None if not set.""" + p = os.environ.get("GITHUB_STEP_SUMMARY") + return Path(p) if p else None + + +def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool: + """ + Write Markdown content to the GitHub Step Summary file if GITHUB_STEP_SUMMARY is set. + append_content: default true, if True, append to the end of the file, else overwrite the whole file + + Returns: + True if written successfully (in GitHub Actions environment), + False if skipped (e.g., running locally where the variable is not set). + """ + sp = gh_summary_path() + if not sp: + logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.") + return False + + md_clean = textwrap.dedent(md).strip() + "\n" + + mode = "a" if append_content else "w" + with sp.open(mode, encoding="utf-8") as f: + f.write(md_clean) + return True + + +def md_heading(text: str, level: int = 2) -> str: + """Generate a Markdown heading string with the given level (1-6).""" + return f"{'#' * max(1, min(level, 6))} {text}\n" + + +def md_details(summary: str, content: str) -> str: + """Generate a collapsible
<details> block with a summary and inner content.""" + return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>
\n" + + +def summarize_content_from_file( + output_dir: Path, + freeze_file: str, + title: str = "Content from file", + code_lang: str = "", # e.g. "text" or "ini" +) -> bool: + f = Path(output_dir) / freeze_file + if not f.exists(): + return False + content = f.read_text(encoding="utf-8").strip() + md = render_content(content, title=title, lang=code_lang) + return write_gh_step_summary(md) + + +def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3): + items = get_wheels(path, max_depth=max_depth) + if not items: + return False + md = render_list(items, title=title) + return write_gh_step_summary(md) + + +def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str: + """ + Render a list of dicts as a Markdown table using Jinja template. + """ + rows = list(rows) + cols = list({k for r in rows for k in r.keys()}) + md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n" + return md + + +def render_list( + items: Iterable[str], + *, + title: str = "List", +) -> str: + tpl = _TPL_LIST_ITEMS + md = tpl.render(title=title, items=items) + return md + + +def render_content( + content: str, + *, + title: str = "Content", + lang: str = "text", +) -> str: + tpl = _TPL_CONTENT + md = tpl.render(title=title, content=content, lang=lang) + return md diff --git a/.ci/lumen_cli/cli/lib/common/git_helper.py b/.ci/lumen_cli/cli/lib/common/git_helper.py new file mode 100644 index 0000000000000..9833caca956cb --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/git_helper.py @@ -0,0 +1,69 @@ +""" +Git Utility helpers for CLI tasks. +""" + +import logging +from pathlib import Path + +from cli.lib.common.path_helper import remove_dir +from git import GitCommandError, RemoteProgress, Repo + + +logger = logging.getLogger(__name__) + + +class PrintProgress(RemoteProgress): + """Simple progress logger for git operations.""" + + def __init__(self, interval: int = 5): + super().__init__() + self._last_percent = -1 + self._interval = interval + + def update(self, op_code, cur, max=None, message=""): + msg = self._cur_line or message + if max and cur: + percent = int(cur / max * 100) + if percent != self._last_percent and percent % self._interval == 0: + self._last_percent = percent + logger.info("Progress: %d%% - %s", percent, msg) + elif msg: + logger.info(msg) + + +def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules=False): + """Clone repository with pinned commit and optional submodules.""" + dst = dst or target + + try: + logger.info("Cloning %s to %s", target, dst) + + # Clone and fetch + remove_dir(dst) + r = Repo.clone_from(repo, dst, progress=PrintProgress()) + r.git.fetch("--all", "--tags") + + # Checkout pinned commit + commit = get_post_build_pinned_commit(target) + logger.info("Checking out pinned %s commit %s", target, commit) + r.git.checkout(commit) + + # Update submodules if requested + if update_submodules and r.submodules: + logger.info("Updating %d submodule(s)", len(r.submodules)) + for sm in r.submodules: + sm.update(init=True, recursive=True, progress=PrintProgress()) + + logger.info("Successfully cloned %s", target) + return r, commit + + except GitCommandError as e: + logger.error("Git operation failed: %s", e) + raise + + +def get_post_build_pinned_commit(name: str, prefix=".github/ci_commit_pins") -> str: + path = Path(prefix) / f"{name}.txt" + if not path.exists(): + raise FileNotFoundError(f"Pin file not found: {path}") + return path.read_text(encoding="utf-8").strip() diff --git a/.ci/lumen_cli/cli/lib/common/logger.py 
b/.ci/lumen_cli/cli/lib/common/logger.py new file mode 100644 index 0000000000000..7a638206d9316 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/logger.py @@ -0,0 +1,14 @@ +""" +Logger Utility helpers for CLI tasks. +""" + +import logging +import sys + + +def setup_logging(level: int = logging.INFO): + logging.basicConfig( + level=level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + stream=sys.stdout, + ) diff --git a/.ci/lumen_cli/cli/lib/common/path_helper.py b/.ci/lumen_cli/cli/lib/common/path_helper.py new file mode 100644 index 0000000000000..4f74aa6e509de --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/path_helper.py @@ -0,0 +1,62 @@ +"""Path utility helpers for CLI tasks.""" + +import logging +import shutil +from pathlib import Path +from typing import Union + + +logger = logging.getLogger(__name__) + + +def get_path(path: Union[str, Path], resolve: bool = False) -> Path: + """Convert to Path object, optionally resolving to absolute path.""" + if not path: + raise ValueError("Path cannot be None or empty") + result = Path(path) + return result.resolve() if resolve else result + + +def ensure_dir_exists(path: Union[str, Path]) -> Path: + """Create directory if it doesn't exist.""" + path_obj = get_path(path) + path_obj.mkdir(parents=True, exist_ok=True) + return path_obj + + +def remove_dir(path: Union[str, Path, None]) -> None: + """Remove directory if it exists.""" + if not path: + return + path_obj = get_path(path) + if path_obj.exists(): + shutil.rmtree(path_obj) + + +def force_create_dir(path: Union[str, Path]) -> Path: + """Remove directory if exists, then create fresh empty directory.""" + remove_dir(path) + return ensure_dir_exists(path) + + +def copy(src: Union[str, Path], dst: Union[str, Path]) -> None: + """Copy file or directory from src to dst.""" + src_path = get_path(src, resolve=True) + dst_path = get_path(dst, resolve=True) + + if not src_path.exists(): + raise FileNotFoundError(f"Source does not exist: {src_path}") + + dst_path.parent.mkdir(parents=True, exist_ok=True) + + if src_path.is_file(): + shutil.copy2(src_path, dst_path) + elif src_path.is_dir(): + shutil.copytree(src_path, dst_path, dirs_exist_ok=True) + else: + raise ValueError(f"Unsupported path type: {src_path}") + + +def is_path_exist(path: Union[str, Path, None]) -> bool: + """Check if path exists.""" + return bool(path and get_path(path).exists()) diff --git a/.ci/lumen_cli/cli/lib/common/pip_helper.py b/.ci/lumen_cli/cli/lib/common/pip_helper.py new file mode 100644 index 0000000000000..a53747e24d256 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/pip_helper.py @@ -0,0 +1,71 @@ +import glob +import logging +import shlex +import shutil +import sys +from collections.abc import Iterable +from importlib.metadata import PackageNotFoundError, version # noqa: UP035 +from typing import Optional, Union + +from cli.lib.common.utils import run_command + + +logger = logging.getLogger(__name__) + + +def pip_install_packages( + packages: Iterable[str] = (), + env=None, + *, + requirements: Optional[str] = None, + constraints: Optional[str] = None, + prefer_uv: bool = False, +) -> None: + use_uv = prefer_uv and shutil.which("uv") is not None + base = ( + [sys.executable, "-m", "uv", "pip", "install"] + if use_uv + else [sys.executable, "-m", "pip", "install"] + ) + cmd = base[:] + if requirements: + cmd += ["-r", requirements] + if constraints: + cmd += ["-c", constraints] + cmd += list(packages) + logger.info("pip installing packages: %s", " ".join(map(shlex.quote, cmd))) + run_command(" 
".join(map(shlex.quote, cmd)), env=env) + + +def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False): + wheel = first_matching_pkg(pattern) + target = f"{wheel}[{extras}]" if extras else wheel + logger.info("Installing %s...", target) + pip_install_packages([target], prefer_uv=pref_uv) + + +def run_python(args: Union[str, list[str]], env=None): + """ + Run the python in the current environment. + """ + if isinstance(args, str): + args = shlex.split(args) + cmd = [sys.executable] + args + run_command(" ".join(map(shlex.quote, cmd)), env=env) + + +def pkg_exists(name: str) -> bool: + try: + pkg_version = version(name) + logger.info("%s already exist with version: %s", name, pkg_version) + return True + except PackageNotFoundError: + logger.info("%s is not installed", name) + return False + + +def first_matching_pkg(pattern: str) -> str: + matches = sorted(glob.glob(pattern)) + if not matches: + raise FileNotFoundError(f"No wheel matching: {pattern}") + return matches[0] diff --git a/.ci/lumen_cli/cli/lib/common/utils.py b/.ci/lumen_cli/cli/lib/common/utils.py new file mode 100644 index 0000000000000..b03309810d986 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/utils.py @@ -0,0 +1,139 @@ +""" +General Utility helpers for CLI tasks. +""" + +import logging +import os +import shlex +import subprocess +import sys +from contextlib import contextmanager +from pathlib import Path +from typing import Optional + + +logger = logging.getLogger(__name__) + + +def run_command( + cmd: str, + use_shell: bool = False, + log_cmd: bool = True, + cwd: Optional[str] = None, + env: Optional[dict] = None, + check: bool = True, +) -> int: + """Run a command with optional shell execution.""" + if use_shell: + args = cmd + log_prefix = "[shell]" + executable = "/bin/bash" + else: + args = shlex.split(cmd) + log_prefix = "[cmd]" + executable = None + + if log_cmd: + display_cmd = cmd if use_shell else " ".join(args) + logger.info("%s %s", log_prefix, display_cmd) + + run_env = {**os.environ, **(env or {})} + + proc = subprocess.run( + args, + shell=use_shell, + executable=executable, + stdout=sys.stdout, + stderr=sys.stderr, + cwd=cwd, + env=run_env, + check=False, + ) + + if check and proc.returncode != 0: + logger.error( + "%s Command failed (exit %s): %s", log_prefix, proc.returncode, cmd + ) + raise subprocess.CalledProcessError( + proc.returncode, args if not use_shell else cmd + ) + + return proc.returncode + + +def str2bool(value: Optional[str]) -> bool: + """Convert environment variables to boolean values.""" + if not value: + return False + if not isinstance(value, str): + raise ValueError( + f"Expected a string value for boolean conversion, got {type(value)}" + ) + value = value.strip().lower() + + true_value_set = {"1", "true", "t", "yes", "y", "on", "enable", "enabled", "found"} + false_value_set = {"0", "false", "f", "no", "n", "off", "disable"} + + if value in true_value_set: + return True + if value in false_value_set: + return False + raise ValueError(f"Invalid string value for boolean conversion: {value}") + + +@contextmanager +def temp_environ(updates: dict[str, str]): + """ + Temporarily set environment variables and restore them after the block. + Args: + updates: Dict of environment variables to set. 
+ """ + missing = object() + old: dict[str, str | object] = {k: os.environ.get(k, missing) for k in updates} + try: + os.environ.update(updates) + yield + finally: + for k, v in old.items(): + if v is missing: + os.environ.pop(k, None) + else: + os.environ[k] = v # type: ignore[arg-type] + + +@contextmanager +def working_directory(path: str): + """ + Temporarily change the working directory inside a context. + """ + if not path: + # No-op context + yield + return + prev_cwd = os.getcwd() + try: + os.chdir(path) + yield + finally: + os.chdir(prev_cwd) + + +def get_wheels( + output_dir: Path, + max_depth: Optional[int] = None, +) -> list[str]: + """Return a list of wheels found in the given output directory.""" + root = Path(output_dir) + if not root.exists(): + return [] + items = [] + for dirpath, _, filenames in os.walk(root): + depth = Path(dirpath).relative_to(root).parts + if max_depth is not None and len(depth) > max_depth: + continue + for fname in sorted(filenames): + if fname.endswith(".whl"): + pkg = fname.split("-")[0] + relpath = str((Path(dirpath) / fname).relative_to(root)) + items.append({"pkg": pkg, "relpath": relpath}) + return items diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py new file mode 100644 index 0000000000000..98cfc807e284a --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -0,0 +1,296 @@ +import logging +import os +import textwrap +from typing import Any + +from cli.lib.common.gh_summary import write_gh_step_summary +from cli.lib.common.git_helper import clone_external_repo +from cli.lib.common.pip_helper import pip_install_packages +from cli.lib.common.utils import run_command, temp_environ, working_directory +from jinja2 import Template + + +logger = logging.getLogger(__name__) + +_TPL_VLLM_INFO = Template( + textwrap.dedent("""\ + ## Vllm against Pytorch CI Test Summary + **Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }}) + {%- if torch_sha %} + **Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }}) + {%- endif %} +""") +) + + +def sample_vllm_test_library(): + """ + Simple sample to unblock the vllm ci development, which is mimic to + https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml + see run_test_plan for more details + """ + # TODO(elainewy): Read from yaml file to handle the env and tests for vllm + return { + "vllm_basic_correctness_test": { + "title": "Basic Correctness Test", + "id": "vllm_basic_correctness_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "steps": [ + "pytest -v -s basic_correctness/test_cumem.py", + "pytest -v -s basic_correctness/test_basic_correctness.py", + "pytest -v -s basic_correctness/test_cpu_offload.py", + "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", + ], + }, + "vllm_basic_models_test": { + "title": "Basic models test", + "id": "vllm_basic_models_test", + "steps": [ + "pytest -v -s models/test_transformers.py", + "pytest -v -s models/test_registry.py", + "pytest -v -s models/test_utils.py", + "pytest -v -s models/test_vision.py", + "pytest -v -s models/test_initialization.py", + ], + }, + "vllm_entrypoints_test": { + "title": "Entrypoints Test ", + "id": "vllm_entrypoints_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "steps": [ + " ".join( + [ + "pytest", + "-v", + "-s", + "entrypoints/llm", + "--ignore=entrypoints/llm/test_lazy_outlines.py", + 
"--ignore=entrypoints/llm/test_generate.py", + "--ignore=entrypoints/llm/test_generate_multiple_loras.py", + "--ignore=entrypoints/llm/test_collective_rpc.py", + ] + ), + "pytest -v -s entrypoints/llm/test_lazy_outlines.py", + "pytest -v -s entrypoints/llm/test_generate.py ", + "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + ], + }, + "vllm_regression_test": { + "title": "Regression Test", + "id": "vllm_regression_test", + "package_install": ["modelscope"], + "steps": [ + "pytest -v -s test_regression.py", + ], + }, + "vllm_lora_tp_test_distributed": { + "title": "LoRA TP Test (Distributed)", + "id": "vllm_lora_tp_test_distributed", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s -x lora/test_chatglm3_tp.py", + "pytest -v -s -x lora/test_llama_tp.py", + "pytest -v -s -x lora/test_llm_with_multi_loras.py", + ], + }, + "vllm_distributed_test_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, + "vllm_lora_28_failure_test": { + "title": "LoRA pytorch 2.8 failure test", + "id": "vllm_lora_28_failure_test", + "steps": ["pytest -v lora/test_quant_model.py"], + }, + "vllm_multi_model_processor_test": { + "title": "Multi-Modal Processor Test", + "id": "vllm_multi_model_processor_test", + "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"], + "steps": [ + "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py", + ], + }, + "vllm_multi_model_test_28_failure_test": { + "title": "Multi-Model Test (Failed 2.8 release)", + "id": "vllm_multi_model_test_28_failure_test", + "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"], + "steps": [ + "pytest -v -s models/multimodal/generation/test_voxtral.py", + "pytest -v -s models/multimodal/pooling", + ], + }, + "vllm_pytorch_compilation_unit_tests": { + "title": "PyTorch Compilation Unit Tests", + "id": "vllm_pytorch_compilation_unit_tests", + "steps": [ + "pytest -v -s compile/test_pass_manager.py", + "pytest -v -s compile/test_fusion.py", + "pytest -v -s compile/test_fusion_attn.py", + "pytest -v -s compile/test_silu_mul_quant_fusion.py", + "pytest -v -s compile/test_sequence_parallelism.py", + "pytest -v -s compile/test_async_tp.py", + "pytest -v -s compile/test_fusion_all_reduce.py", + "pytest -v -s compile/test_decorator.py", + ], + }, + "vllm_languagde_model_test_extended_generation_28_failure_test": { + "title": "Language Models Test (Extended Generation) 2.8 release failure", + "id": "vllm_languagde_model_test_extended_generation_28_failure_test", + "package_install": [ + "--no-build-isolation", + "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8", + ], + "steps": [ + "pytest -v -s models/language/generation/test_mistral.py", + ], + }, + "vllm_distributed_test_2_gpu_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_2_gpu_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, + # TODO(elainewy):need to add g6 with 4 gpus to run this test + "vllm_lora_test": { + "title": "LoRA Test %N", + "id": "lora_test", + "parallelism": 4, + "steps": [ + "echo '[checking] 
list sharded lora tests:'", + " ".join( + [ + "pytest -q --collect-only lora", + "--shard-id=$$BUILDKITE_PARALLEL_JOB", + "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT", + "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py", + ] + ), + "echo '[checking] Done. list lora tests'", + " ".join( + [ + "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB", + "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT", + "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py", + ] + ), + ], + }, + } + + +def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0): + """ + a method to check if the test plan is parallelism or not. + """ + parallelism = int(tests.get("parallelism", "0")) + is_parallel = parallelism and parallelism > 1 + + if not is_parallel: + return False + + if shard_id > num_shards: + raise RuntimeError( + f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided" + ) + + if num_shards != parallelism: + raise RuntimeError( + f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided" + ) + + return True + + +def run_test_plan( + test_plan: str, + test_target: str, + tests_map: dict[str, Any], + shard_id: int = 0, + num_shards: int = 0, +): + """ + a method to run list of tests based on the test plan. + """ + logger.info("run %s tests.....", test_target) + if test_plan not in tests_map: + raise RuntimeError( + f"test {test_plan} not found, please add it to test plan pool" + ) + tests = tests_map[test_plan] + pkgs = tests.get("package_install", []) + title = tests.get("title", "unknown test") + + is_parallel = check_parallelism(tests, title, shard_id, num_shards) + if is_parallel: + title = title.replace("%N", f"{shard_id}/{num_shards}") + + logger.info("Running tests: %s", title) + if pkgs: + logger.info("Installing packages: %s", pkgs) + pip_install_packages(packages=pkgs, prefer_uv=True) + with ( + working_directory(tests.get("working_directory", "tests")), + temp_environ(tests.get("env_vars", {})), + ): + failures = [] + for step in tests["steps"]: + logger.info("Running step: %s", step) + if is_parallel: + step = replace_buildkite_placeholders(step, shard_id, num_shards) + logger.info("Running parallel step: %s", step) + code = run_command(cmd=step, check=False, use_shell=True) + if code != 0: + failures.append(step) + logger.info("Finish running step: %s", step) + if failures: + logger.error("Failed tests: %s", failures) + raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}") + logger.info("Done. 
All tests passed") + + +def clone_vllm(dst: str = "vllm"): + _, commit = clone_external_repo( + target="vllm", + repo="https://github.com/vllm-project/vllm.git", + dst=dst, + update_submodules=True, + ) + return commit + + +def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str: + mapping = { + "$$BUILDKITE_PARALLEL_JOB_COUNT": str(num_shards), + "$$BUILDKITE_PARALLEL_JOB": str(shard_id), + } + for k in sorted(mapping, key=len, reverse=True): + step = step.replace(k, mapping[k]) + return step + + +def summarize_build_info(vllm_commit: str) -> bool: + torch_sha = os.getenv("GITHUB_SHA") + md = ( + _TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip() + + "\n" + ) + return write_gh_step_summary(md) diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py new file mode 100644 index 0000000000000..8db48065cb052 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -0,0 +1,285 @@ +import logging +import os +import textwrap +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +from cli.lib.common.cli_helper import BaseRunner +from cli.lib.common.docker_helper import local_image_exists +from cli.lib.common.envs_helper import ( + env_bool_field, + env_path_field, + env_str_field, + with_params_help, +) +from cli.lib.common.gh_summary import ( + gh_summary_path, + summarize_content_from_file, + summarize_wheels, +) +from cli.lib.common.path_helper import ( + copy, + ensure_dir_exists, + force_create_dir, + get_path, + is_path_exist, +) +from cli.lib.common.utils import run_command +from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info + + +logger = logging.getLogger(__name__) + + +# Default path for docker build artifacts +_DEFAULT_RESULT_PATH = "./shared" + +# Temp folder in vllm work place to cp torch whls in vllm work directory for docker build +_VLLM_TEMP_FOLDER = "tmp" + + +@dataclass +class VllmBuildParameters: + """ + Parameters defining the vllm external input configurations. + Combine with VllmDockerBuildArgs to define the vllm build environment + """ + + # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH. + # Otherwise docker build pull torch nightly during build + # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True + use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True) + torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist") + + # USE_LOCAL_BASE_IMAGE: when true, use an existing local Docker base image; requires BASE_IMAGE + # Otherwise, pull dockerfile's default image remotely + # BASE_IMAGE: name:tag (only needed when use_local_base_image is True) + use_local_base_image: bool = env_bool_field("USE_LOCAL_BASE_IMAGE", True) + base_image: str = env_str_field("BASE_IMAGE") + + # USE_LOCAL_DOCKERFILE: when true("1"), use a local Dockerfile; requires DOCKERFILE_PATH. 
+ # otherwise, use vllm's default dockerfile.torch_nightly for build + # DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True" + use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True) + dockerfile_path: Path = env_path_field( + "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm" + ) + + # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts + output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm") + + # --- Build args ---------------------------------------------------------- + target_stage: str = env_str_field("TARGET_STAGE", "export-wheels") + + tag_name: str = env_str_field("TAG", "vllm-wheels") + + cuda_version: str = env_str_field("CUDA_VERSION", "12.8.1") + + python_version: str = env_str_field("PYTHON_VERSION", "3.12") + + max_jobs: str = env_str_field("MAX_JOBS", "64") + + sccache_bucket: str = env_str_field("SCCACHE_BUCKET") + + sccache_region: str = env_str_field("SCCACHE_REGION") + + torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") + + def __post_init__(self): + checks = [ + ( + self.use_torch_whl, # flag + True, # trigger_value + "torch_whls_path", # resource + is_path_exist, # check_func + "TORCH_WHEELS_PATH is not provided, but USE_TORCH_WHEEL is set to 1", + ), + ( + self.use_local_base_image, + True, + "base_image", + local_image_exists, + f"BASE_IMAGE {self.base_image} does not found, but USE_LOCAL_BASE_IMAGE is set to 1", + ), + ( + self.use_local_dockerfile, + True, + "dockerfile_path", + is_path_exist, + " DOCKERFILE_PATH path does not found, but USE_LOCAL_DOCKERFILE is set to 1", + ), + ] + for flag, trigger_value, attr_name, check_func, error_msg in checks: + value = getattr(self, attr_name) + if flag == trigger_value: + if not value or not check_func(value): + raise ValueError(error_msg) + else: + logger.info("flag %s is not set", flag) + if not self.output_dir: + raise ValueError("missing required output_dir") + + +@with_params_help(VllmBuildParameters) +class VllmBuildRunner(BaseRunner): + """ + Build vLLM using docker buildx. + + Environment variable options: + "USE_TORCH_WHEEL": "1: use local wheels; 0: pull nightly from pypi", + "TORCH_WHEELS_PATH": "Path to local wheels (when USE_TORCH_WHEEL=1)", + + "USE_LOCAL_BASE_IMAGE": "1: use local base image; 0: default image", + "BASE_IMAGE": "name:tag to indicate base image the dockerfile depends on (when USE_LOCAL_BASE_IMAGE=1)", + + "USE_LOCAL_DOCKERFILE": "1: use local Dockerfile; 0: vllm repo default dockerfile.torch_nightly", + "DOCKERFILE_PATH": "Path to Dockerfile (when USE_LOCAL_DOCKERFILE=1)", + + "OUTPUT_DIR": "e.g. './shared'", + + "TORCH_CUDA_ARCH_LIST": "e.g. '8.0' or '8.0;9.0'", + "CUDA_VERSION": "e.g. '12.8.1'", + "PYTHON_VERSION": "e.g. '3.12'", + "MAX_JOBS": "e.g. '64'", + "SCCACHE_BUCKET": "e.g. 'my-bucket'", + "SCCACHE_REGION": "e.g. 'us-west-2'", + """ + + def __init__(self, args=None): + self.work_directory = "vllm" + + def run(self): + """ + main function to run vllm build + 1. prepare vllm build environment + 2. prepare the docker build command args + 3. 
run docker build + """ + inputs = VllmBuildParameters() + logger.info("Running vllm build with inputs: %s", inputs) + vllm_commit = clone_vllm() + + self.cp_dockerfile_if_exist(inputs) + # cp torch wheels from root direct to vllm workspace if exist + self.cp_torch_whls_if_exist(inputs) + + # make sure the output dir to store the build artifacts exist + ensure_dir_exists(Path(inputs.output_dir)) + + cmd = self._generate_docker_build_cmd(inputs) + logger.info("Running docker build: \n %s", cmd) + + try: + run_command(cmd, cwd="vllm", env=os.environ.copy()) + finally: + self.genearte_vllm_build_summary(vllm_commit, inputs) + + def genearte_vllm_build_summary( + self, vllm_commit: str, inputs: VllmBuildParameters + ): + if not gh_summary_path(): + return logger.info("Skipping, not detect GH Summary env var....") + logger.info("Generate GH Summary ...") + # summarize vllm build info + summarize_build_info(vllm_commit) + + # summarize vllm build artifacts + vllm_artifact_dir = inputs.output_dir / "wheels" + summarize_content_from_file( + vllm_artifact_dir, + "build_summary.txt", + title="Vllm build env pip package summary", + ) + summarize_wheels( + inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts" + ) + summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts") + + def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str: + if not inputs.use_torch_whl: + return "" + tmp_dir = f"./{self.work_directory}/{_VLLM_TEMP_FOLDER}" + tmp_path = Path(tmp_dir) + force_create_dir(tmp_path) + copy(inputs.torch_whls_path, tmp_dir) + return tmp_dir + + def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters): + if not inputs.use_local_dockerfile: + logger.info("using vllm default dockerfile.torch_nightly for build") + return + dockerfile_path = get_path(inputs.dockerfile_path, resolve=True) + vllm_torch_dockerfile = Path( + f"./{self.work_directory}/docker/Dockerfile.nightly_torch" + ) + copy(dockerfile_path, vllm_torch_dockerfile) + + def get_result_path(self, path): + """ + Get the absolute path of the result path + """ + if not path: + path = _DEFAULT_RESULT_PATH + abs_path = get_path(path, resolve=True) + return abs_path + + def _get_torch_wheel_path_arg(self, torch_whl_dir: Optional[Path]) -> str: + if not torch_whl_dir: + return "" + return f"--build-arg TORCH_WHEELS_PATH={_VLLM_TEMP_FOLDER}" + + def _get_base_image_args(self, inputs: VllmBuildParameters) -> tuple[str, str, str]: + """ + Returns: + - base_image_arg: docker buildx arg string for base image + - final_base_image_arg: docker buildx arg string for vllm-base stage + - pull_flag: --pull=true or --pull=false depending on whether the image exists locally + """ + if not inputs.use_local_base_image: + return "", "", "" + + base_image = inputs.base_image + + # set both base image and final base image to the same local image + base_image_arg = f"--build-arg BUILD_BASE_IMAGE={base_image}" + final_base_image_arg = f"--build-arg FINAL_BASE_IMAGE={base_image}" + + if local_image_exists(base_image): + pull_flag = "--pull=false" + return base_image_arg, final_base_image_arg, pull_flag + logger.info( + "[INFO] Local image not found:%s will try to pull from remote", {base_image} + ) + return base_image_arg, final_base_image_arg, "" + + def _generate_docker_build_cmd( + self, + inputs: VllmBuildParameters, + ) -> str: + base_image_arg, final_base_image_arg, pull_flag = self._get_base_image_args( + inputs + ) + torch_arg = self._get_torch_wheel_path_arg(inputs.torch_whls_path) + + return textwrap.dedent( 
+ f""" + docker buildx build \ + --output type=local,dest={inputs.output_dir} \ + -f docker/Dockerfile.nightly_torch \ + {pull_flag} \ + {torch_arg} \ + {base_image_arg} \ + {final_base_image_arg} \ + --build-arg max_jobs={inputs.max_jobs} \ + --build-arg CUDA_VERSION={inputs.cuda_version} \ + --build-arg PYTHON_VERSION={inputs.python_version} \ + --build-arg USE_SCCACHE={int(bool(inputs.sccache_bucket and inputs.sccache_region))} \ + --build-arg SCCACHE_BUCKET_NAME={inputs.sccache_bucket} \ + --build-arg SCCACHE_REGION_NAME={inputs.sccache_region} \ + --build-arg torch_cuda_arch_list='{inputs.torch_cuda_arch_list}' \ + --target {inputs.target_stage} \ + -t {inputs.tag_name} \ + --progress=plain . + """ + ).strip() diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py new file mode 100644 index 0000000000000..76401e33f29fd --- /dev/null +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py @@ -0,0 +1,269 @@ +import logging +import os +import re +import subprocess +import sys +from collections.abc import Iterable +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any + +from cli.lib.common.cli_helper import BaseRunner +from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env +from cli.lib.common.path_helper import copy, remove_dir +from cli.lib.common.pip_helper import ( + pip_install_first_match, + pip_install_packages, + pkg_exists, + run_python, +) +from cli.lib.common.utils import run_command, working_directory +from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library + + +logger = logging.getLogger(__name__) + + +@dataclass +class VllmTestParameters: + """ + Parameters defining the vllm external test input + + !!!DO NOT ADD SECRETS IN THIS CLASS!!! + you can put environment variable name in VllmTestParameters if it's not the same as the secret one + fetch secrests directly from env variables during runtime + """ + + torch_whls_path: Path = env_path_field("WHEELS_PATH", "./dist") + + vllm_whls_path: Path = env_path_field( + "VLLM_WHEELS_PATH", "./dist/external/vllm/wheels" + ) + + torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9") + + def __post_init__(self): + if not self.torch_whls_path.exists(): + raise ValueError("missing torch_whls_path") + if not self.vllm_whls_path.exists(): + raise ValueError("missing vllm_whls_path") + + +class TestInpuType(Enum): + TEST_PLAN = "test_plan" + UNKNOWN = "unknown" + + +class VllmTestRunner(BaseRunner): + def __init__(self, args: Any): + self.work_directory = "vllm" + self.test_plan = "" + self.test_type = TestInpuType.UNKNOWN + + self.shard_id = args.shard_id + self.num_shards = args.num_shards + + if args.test_plan: + self.test_plan = args.test_plan + self.test_type = TestInpuType.TEST_PLAN + + # Matches the structeur in the artifacts.zip from torcb build + self.TORCH_WHL_PATH_REGEX = "torch*.whl" + self.TORCH_WHL_EXTRA = "opt-einsum" + self.TORCH_ADDITIONAL_WHLS_REGEX = [ + "vision/torchvision*.whl", + "audio/torchaudio*.whl", + ] + + # Match the structure of the artifacts.zip from vllm external build + self.VLLM_TEST_WHLS_REGEX = [ + "xformers/*.whl", + "vllm/vllm*.whl", + "flashinfer-python/flashinfer*.whl", + ] + + def prepare(self): + """ + prepare test environment for vllm. 
This includes clone vllm repo, install all wheels, test dependencies and set env + """ + params = VllmTestParameters() + logger.info("Display VllmTestParameters %s", params) + self._set_envs(params) + + clone_vllm(dst=self.work_directory) + with working_directory(self.work_directory): + remove_dir(Path("vllm")) + self._install_wheels(params) + self._install_dependencies() + # verify the torches are not overridden by test dependencies + check_versions() + + def run(self): + """ + main function to run vllm test + """ + self.prepare() + try: + with working_directory(self.work_directory): + if self.test_type == TestInpuType.TEST_PLAN: + if self.num_shards > 1: + run_test_plan( + self.test_plan, + "vllm", + sample_vllm_test_library(), + self.shard_id, + self.num_shards, + ) + else: + run_test_plan( + self.test_plan, "vllm", sample_vllm_test_library() + ) + else: + raise ValueError(f"Unknown test type {self.test_type}") + finally: + # double check the torches are not overridden by other packages + check_versions() + + def _install_wheels(self, params: VllmTestParameters): + logger.info("Running vllm test with inputs: %s", params) + if not pkg_exists("torch"): + # install torch from local whls if it's not installed yet. + torch_p = f"{str(params.torch_whls_path)}/{self.TORCH_WHL_PATH_REGEX}" + pip_install_first_match(torch_p, self.TORCH_WHL_EXTRA) + + torch_whls_path = [ + f"{str(params.torch_whls_path)}/{whl_path}" + for whl_path in self.TORCH_ADDITIONAL_WHLS_REGEX + ] + for torch_whl in torch_whls_path: + pip_install_first_match(torch_whl) + logger.info("Done. Installed torch and other torch-related wheels ") + + logger.info("Installing vllm wheels") + vllm_whls_path = [ + f"{str(params.vllm_whls_path)}/{whl_path}" + for whl_path in self.VLLM_TEST_WHLS_REGEX + ] + for vllm_whl in vllm_whls_path: + pip_install_first_match(vllm_whl) + logger.info("Done. Installed vllm wheels") + + def _install_test_dependencies(self): + """ + This method replaces torch dependencies with local torch wheel info in + requirements/test.in file from vllm repo. then generates the test.txt + in runtime + """ + logger.info("generate test.txt from requirements/test.in with local torch whls") + preprocess_test_in() + copy("requirements/test.txt", "snapshot_constraint.txt") + + run_command( + f"{sys.executable} -m uv pip compile requirements/test.in " + "-o test.txt " + "--index-strategy unsafe-best-match " + "--constraint snapshot_constraint.txt " + "--torch-backend cu128" + ) + pip_install_packages(requirements="test.txt", prefer_uv=True) + logger.info("Done. installed requirements for test dependencies") + + def _install_dependencies(self): + pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True) + pip_install_packages(packages=["hf_transfer"], prefer_uv=True) + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + + # using script from vllm repo to remove all torch packages from requirements txt + run_python("use_existing_torch.py") + + # install common packages + for requirements in ["requirements/common.txt", "requirements/build.txt"]: + pip_install_packages( + requirements=requirements, + prefer_uv=True, + ) + # install test packages + self._install_test_dependencies() + + def _set_envs(self, inputs: VllmTestParameters): + os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list + if not validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")): + logger.warning( + "Missing supported TORCH_CUDA_ARCH_LIST. 
" + "Currently support TORCH_CUDA_ARCH_LIST env var " + "with supported arch [8.0, 8.9, 9.0]" + ) + + os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "") + if not get_env("HF_TOKEN"): + raise ValueError( + "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var" + ) + if not get_env("TORCH_CUDA_ARCH_LIST"): + raise ValueError( + "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var" + ) + + +def preprocess_test_in( + target_file: str = "requirements/test.in", additional_packages: Iterable[str] = () +): + """ + This modifies the target_file file in place in vllm work directory. + It removes torch and unwanted packages in target_file and replace with local torch whls + package with format "$WHEEL_PACKAGE_NAME @ file://" + """ + additional_package_to_move = list(additional_packages or ()) + pkgs_to_remove = [ + "torch", + "torchvision", + "torchaudio", + "xformers", + "mamba_ssm", + ] + additional_package_to_move + # Read current requirements + target_path = Path(target_file) + lines = target_path.read_text().splitlines() + + pkgs_to_add = [] + + # Remove lines starting with the package names (==, @, >=) — case-insensitive + pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE) + kept_lines = [line for line in lines if not pattern.match(line)] + + # Get local installed torch/vision/audio from pip freeze + # This is hacky, but it works + pip_freeze = subprocess.check_output(["pip", "freeze"], text=True) + header_lines = [ + line + for line in pip_freeze.splitlines() + if re.match( + r"^(torch|torchvision|torchaudio)\s*@\s*file://", line, re.IGNORECASE + ) + ] + + # Write back: header_lines + blank + kept_lines + out_lines = header_lines + [""] + kept_lines + if pkgs_to_add: + out_lines += [""] + pkgs_to_add + + out = "\n".join(out_lines) + "\n" + target_path.write_text(out) + logger.info("[INFO] Updated %s", target_file) + + +def validate_cuda(value: str) -> bool: + VALID_VALUES = {"8.0", "8.9", "9.0"} + return all(v in VALID_VALUES for v in value.split()) + + +def check_versions(): + """ + check installed packages version + """ + logger.info("Double check installed packages") + patterns = ["torch", "xformers", "torchvision", "torchaudio", "vllm"] + for pkg in patterns: + pkg_exists(pkg) + logger.info("Done. 
checked installed packages") diff --git a/.ci/lumen_cli/cli/run.py b/.ci/lumen_cli/cli/run.py new file mode 100644 index 0000000000000..1711109170756 --- /dev/null +++ b/.ci/lumen_cli/cli/run.py @@ -0,0 +1,40 @@ +# main.py + +import argparse +import logging + +from cli.build_cli.register_build import register_build_commands +from cli.lib.common.logger import setup_logging +from cli.test_cli.register_test import register_test_commands + + +logger = logging.getLogger(__name__) + + +def main(): + # Define top-level parser + parser = argparse.ArgumentParser(description="Lumos CLI") + subparsers = parser.add_subparsers(dest="command", required=True) + parser.add_argument( + "--log-level", default="INFO", help="Log level (DEBUG, INFO, WARNING, ERROR)" + ) + + # registers second-level subcommands + register_build_commands(subparsers) + register_test_commands(subparsers) + + # parse args after all options are registered + args = parser.parse_args() + + # setup global logging + setup_logging(getattr(logging, args.log_level.upper(), logging.INFO)) + logger.debug("Parsed args: %s", args) + + if hasattr(args, "func"): + args.func(args) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_complex b/.ci/lumen_cli/cli/test_cli/__init__.py similarity index 100% rename from test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_complex rename to .ci/lumen_cli/cli/test_cli/__init__.py diff --git a/.ci/lumen_cli/cli/test_cli/register_test.py b/.ci/lumen_cli/cli/test_cli/register_test.py new file mode 100644 index 0000000000000..2973341b83ed2 --- /dev/null +++ b/.ci/lumen_cli/cli/test_cli/register_test.py @@ -0,0 +1,62 @@ +import argparse +import logging + +from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec +from cli.lib.core.vllm.vllm_test import VllmTestRunner + + +logger = logging.getLogger(__name__) + +# Maps targets to their argparse configuration and runner +# it adds new target to path python -m cli.run build external {target} with buildrunner +_TARGETS: dict[str, TargetSpec] = { + "vllm": { + "runner": VllmTestRunner, + "help": "test vLLM with pytorch main", + } + # add yours ... +} + + +def common_args(parser: argparse.ArgumentParser) -> None: + """ + Add common CLI arguments to the given parser. + """ + parser.add_argument( + "--shard-id", + type=int, + default=1, + help="a shard id to run, e.g. '0,1,2,3'", + ) + parser.add_argument( + "--num-shards", + type=int, + default=1, + help="a number of shards to run, e.g. '4'", + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "-tp", + "--test-plan", + type=str, + help="a pre-defined test plan to run, e.g. 
'basic_correctness_test'", + ) + + +def register_test_commands(subparsers: argparse._SubParsersAction) -> None: + build_parser = subparsers.add_parser( + "test", + help="test related commands", + formatter_class=RichHelp, + ) + build_subparsers = build_parser.add_subparsers(dest="test_command", required=True) + overview = "\n".join( + f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items() + ) + external_parser = build_subparsers.add_parser( + "external", + help="Test external targets", + description="Test third-party targets.\n\nAvailable targets:\n" + overview, + formatter_class=RichHelp, + ) + register_targets(external_parser, _TARGETS, common_args=common_args) diff --git a/.ci/lumen_cli/pyproject.toml b/.ci/lumen_cli/pyproject.toml new file mode 100644 index 0000000000000..bf5edc77d9250 --- /dev/null +++ b/.ci/lumen_cli/pyproject.toml @@ -0,0 +1,23 @@ +[project] +name = "lumen-ci" +version = "0.1.0" +dependencies = [ + "pyyaml==6.0.2", + "GitPython==3.1.45", + "docker==7.1.0", + "pytest==7.3.2", + "uv==0.8.6" +] + +[tool.setuptools] +packages = ["cli"] + +[tool.setuptools.package-dir] +cli = "cli" + +[tool.ruff.lint] +# Enable preview mode for linting +preview = true + +# Now you can select your preview rules, like RUF048 +extend-select = ["RUF048"] diff --git a/.ci/lumen_cli/tests/test_app.py b/.ci/lumen_cli/tests/test_app.py new file mode 100644 index 0000000000000..9d57b37f159d7 --- /dev/null +++ b/.ci/lumen_cli/tests/test_app.py @@ -0,0 +1,47 @@ +# tests/test_cli.py +import io +import sys +import unittest +from contextlib import redirect_stderr, redirect_stdout +from unittest.mock import patch + +from cli.run import main + + +class TestArgparseCLI(unittest.TestCase): + @patch("cli.build_cli.register_build.VllmBuildRunner.run", return_value=None) + @patch("cli.build_cli.register_build.VllmBuildRunner.__init__", return_value=None) + def test_cli_run_build_external(self, mock_init, mock_run): + from cli.run import main # import after patches if needed + + test_args = ["cli.run", "build", "external", "vllm"] + with patch.object(sys, "argv", test_args): + # argparse may call sys.exit on error; capture to avoid test aborts + try: + main() + except SystemExit: + pass + mock_init.assert_called_once() # got constructed + mock_run.assert_called_once_with() # run() called + + def test_build_help(self): + test_args = ["cli.run", "build", "--help"] + + with patch.object(sys, "argv", test_args): + stdout = io.StringIO() + stderr = io.StringIO() + + # --help always raises SystemExit(0) + with self.assertRaises(SystemExit) as cm: + with redirect_stdout(stdout), redirect_stderr(stderr): + main() + + self.assertEqual(cm.exception.code, 0) + + output = stdout.getvalue() + self.assertIn("usage", output) + self.assertIn("external", output) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_cli_helper.py b/.ci/lumen_cli/tests/test_cli_helper.py new file mode 100644 index 0000000000000..848f22d6be200 --- /dev/null +++ b/.ci/lumen_cli/tests/test_cli_helper.py @@ -0,0 +1,115 @@ +import argparse +import io +import unittest +from contextlib import redirect_stderr +from unittest.mock import patch + +from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec + + +# ---- Dummy runners for unittests---- +class FooRunner(BaseRunner): + """Foo description from docstring.""" + + def run(self) -> None: # replaced by mock + pass + + +class BarRunner(BaseRunner): + def run(self) -> None: # replaced by mock + pass + + +def 
add_foo_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--x", type=int, required=True, help="x value") + + +def common_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--verbose", action="store_true", help="verbose flag") + + +def build_parser(specs: dict[str, TargetSpec]) -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="app", formatter_class=RichHelp) + register_targets( + parser=parser, + target_specs=specs, + common_args=common_args, + ) + return parser + + +def get_subparser( + parser: argparse.ArgumentParser, name: str +) -> argparse.ArgumentParser: + subparsers_action = next( + a + for a in parser._subparsers._group_actions # type: ignore[attr-defined] + if isinstance(a, argparse._SubParsersAction) + ) + return subparsers_action.choices[name] + + +class TestRegisterTargets(unittest.TestCase): + def test_metavar_lists_targets(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + "bar": {"runner": BarRunner}, + } + parser = build_parser(specs) + subparsers_action = next( + a + for a in parser._subparsers._group_actions # type: ignore[attr-defined] + if isinstance(a, argparse._SubParsersAction) + ) + self.assertEqual(subparsers_action.metavar, "{foo,bar}") + + def test_add_arguments_and_common_args_present(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + } + parser = build_parser(specs) + foo = get_subparser(parser, "foo") + help_text = foo.format_help() + self.assertIn("--x", help_text) + self.assertIn("--verbose", help_text) + + def test_runner_constructed_with_ns_and_run_called(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + } + parser = build_parser(specs) + + with ( + patch.object(FooRunner, "__init__", return_value=None) as mock_init, + patch.object(FooRunner, "run", return_value=None) as mock_run, + ): + ns = parser.parse_args(["foo", "--x", "3", "--verbose"]) + ns.func(ns) # set by register_targets + # __init__ received the Namespace + self.assertEqual(mock_init.call_count, 1) + (called_ns,), _ = mock_init.call_args + self.assertIsInstance(called_ns, argparse.Namespace) + # run() called with no args + mock_run.assert_called_once_with() + + def test_runner_docstring_used_as_description_when_missing(self): + specs: dict[str, TargetSpec] = { + "foo": {"runner": FooRunner, "add_arguments": add_foo_args}, + } + parser = build_parser(specs) + foo = get_subparser(parser, "foo") + help_text = foo.format_help() + self.assertIn("Foo description from docstring.", help_text) + + def test_missing_target_raises_systemexit_with_usage(self): + specs: dict[str, TargetSpec] = {"foo": {"runner": FooRunner}} + parser = build_parser(specs) + buf = io.StringIO() + with self.assertRaises(SystemExit), redirect_stderr(buf): + parser.parse_args([]) + err = buf.getvalue() + self.assertIn("usage:", err) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_docker_helper.py b/.ci/lumen_cli/tests/test_docker_helper.py new file mode 100644 index 0000000000000..0f15cd4b99bad --- /dev/null +++ b/.ci/lumen_cli/tests/test_docker_helper.py @@ -0,0 +1,75 @@ +import unittest +from unittest import mock +from unittest.mock import MagicMock + +import docker.errors as derr +from cli.lib.common.docker_helper import _get_client, local_image_exists + + +class TestDockerImageHelpers(unittest.TestCase): + def setUp(self): + # Reset the singleton in the target module + patcher = 
mock.patch("cli.lib.common.docker_helper._docker_client", None) + self.addCleanup(patcher.stop) + patcher.start() + + def test_local_image_exists_true(self): + # Mock a docker client whose images.get returns an object (no exception) + mock_client = MagicMock() + mock_client.images.get.return_value = object() + ok = local_image_exists("repo:tag", client=mock_client) + self.assertTrue(ok) + + def test_local_image_exists_not_found_false(self): + mock_client = MagicMock() + # Raise docker.errors.NotFound + mock_client.images.get.side_effect = derr.NotFound("nope") + ok = local_image_exists("missing:latest", client=mock_client) + self.assertFalse(ok) + + def test_local_image_exists_api_error_false(self): + mock_client = MagicMock() + mock_client.images.get.side_effect = derr.APIError("boom", None) + + ok = local_image_exists("broken:tag", client=mock_client) + self.assertFalse(ok) + + def test_local_image_exists_uses_lazy_singleton(self): + # Patch docker.from_env used by _get_client() + with mock.patch( + "cli.lib.common.docker_helper.docker.from_env" + ) as mock_from_env: + mock_docker_client = MagicMock() + mock_from_env.return_value = mock_docker_client + + # First call should create and cache the client + c1 = _get_client() + self.assertIs(c1, mock_docker_client) + mock_from_env.assert_called_once() + + # Second call should reuse cached client (no extra from_env calls) + c2 = _get_client() + self.assertIs(c2, mock_docker_client) + mock_from_env.assert_called_once() # still once + + def test_local_image_exists_without_client_param_calls_get_client_once(self): + # Ensure _get_client is called and cached; local_image_exists should reuse it + with mock.patch("cli.lib.common.docker_helper._get_client") as mock_get_client: + mock_client = MagicMock() + mock_get_client.return_value = mock_client + + # 1st call + local_image_exists("repo:tag") + # 2nd call + local_image_exists("repo:tag2") + + # local_image_exists should call _get_client each time, + # but your _get_client itself caches docker.from_env. 
+ self.assertEqual(mock_get_client.call_count, 2) + self.assertEqual(mock_client.images.get.call_count, 2) + mock_client.images.get.assert_any_call("repo:tag") + mock_client.images.get.assert_any_call("repo:tag2") + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_envs_helper.py b/.ci/lumen_cli/tests/test_envs_helper.py new file mode 100644 index 0000000000000..187f3016d7ea5 --- /dev/null +++ b/.ci/lumen_cli/tests/test_envs_helper.py @@ -0,0 +1,149 @@ +import os +import unittest +from dataclasses import dataclass +from pathlib import Path +from unittest.mock import patch + +import cli.lib.common.envs_helper as m + + +class TestEnvHelpers(unittest.TestCase): + def setUp(self): + # Keep a copy of the original environment to restore later + self._env_backup = dict(os.environ) + + def tearDown(self): + # Restore environment to original state + os.environ.clear() + os.environ.update(self._env_backup) + + # -------- get_env -------- + def test_get_env_unset_returns_default(self): + with patch.dict(os.environ, {}, clear=True): + self.assertEqual(m.get_env("FOO", "default"), "default") + + def test_get_env_empty_returns_default(self): + with patch.dict(os.environ, {"FOO": ""}, clear=True): + self.assertEqual(m.get_env("FOO", "default"), "default") + + def test_get_env_set_returns_value(self): + with patch.dict(os.environ, {"FOO": "bar"}, clear=True): + self.assertEqual(m.get_env("FOO", "default"), "bar") + + def test_get_env_not_exist_returns_default(self): + with patch.dict(os.environ, {"FOO": "bar"}, clear=True): + self.assertEqual(m.get_env("TEST_NOT_EXIST", "default"), "default") + + def test_get_env_not_exist_without_default(self): + with patch.dict(os.environ, {"FOO": "bar"}, clear=True): + self.assertEqual(m.get_env("TEST_NOT_EXIST"), "") + + # -------- env_bool -------- + def test_env_bool_uses_default_when_unset(self): + with patch.dict(os.environ, {}, clear=True): + self.assertTrue(m.env_bool("FLAG", default=True)) + self.assertFalse(m.env_bool("FLAG", default=False)) + + def test_env_bool_uses_str2bool_when_set(self): + # Patch str2bool used by env_bool so we don't depend on its exact behavior + def fake_str2bool(s: str) -> bool: + return s.lower() in {"1", "true", "yes", "on", "y"} + + with ( + patch.dict(os.environ, {"FLAG": "yEs"}, clear=True), + patch.object(m, "str2bool", fake_str2bool), + ): + self.assertTrue(m.env_bool("FLAG", default=False)) + + # -------- env_path_optional / env_path -------- + def test_env_path_optional_unset_returns_none_by_default(self): + with patch.dict(os.environ, {}, clear=True): + self.assertIsNone(m.env_path_optional("P")) + + def test_env_path_optional_unset_returns_none_when_env_var_is_empty(self): + with patch.dict(os.environ, {"P": ""}, clear=True): + self.assertIsNone(m.env_path_optional("P")) + + def test_env_path_optional_unset_returns_default_str(self): + # default as string; resolve=True by default -> absolute path + default_str = "x/y" + with patch.dict(os.environ, {}, clear=True): + p = m.env_path_optional("P", default=default_str) + self.assertIsInstance(p, Path) + self.assertIsNotNone(p) + if p: + self.assertTrue(p.is_absolute()) + self.assertEqual(p.parts[-2:], ("x", "y")) + + def test_env_path_optional_unset_returns_default_path_no_resolve(self): + d = Path("z") + with patch.dict(os.environ, {}, clear=True): + p = m.env_path_optional("P", default=d, resolve=False) + self.assertEqual(p, d) + + def test_env_path_optional_respects_resolve_true(self): + with patch.dict(os.environ, {"P": "a/b"}, 
clear=True): + p = m.env_path_optional("P", resolve=True) + self.assertIsInstance(p, Path) + if p: + self.assertTrue(p.is_absolute()) + + def test_env_path_optional_respects_resolve_false(self): + with patch.dict(os.environ, {"P": "rel/dir"}, clear=True): + p = m.env_path_optional("P", resolve=False) + self.assertEqual(p, Path("rel/dir")) + if p: + self.assertFalse(p.is_absolute()) + + def test_env_path_raises_when_missing_and_default_none(self): + with patch.dict(os.environ, {}, clear=True): + with self.assertRaises(ValueError): + m.env_path("P", None, resolve=True) + + def test_env_path_returns_path_when_present(self): + tmp = Path("./b").resolve() + with patch.dict(os.environ, {"P": str(tmp)}, clear=True): + p = m.env_path("P", None, resolve=True) + self.assertEqual(p, tmp) + + # -------- dataclass field helpers -------- + def test_dataclass_fields_read_env_at_instantiation(self): + @dataclass + class Cfg: + flag: bool = m.env_bool_field("FLAG", default=False) + out: Path = m.env_path_field("OUT", default="ab", resolve=True) + name: str = m.env_str_field("NAME", default="anon") + + # First instantiation + with patch.dict( + os.environ, {"FLAG": "true", "OUT": "outdir", "NAME": "alice"}, clear=True + ): + cfg1 = Cfg() + self.assertTrue(cfg1.flag) + self.assertIsInstance(cfg1.out, Path) + self.assertTrue(cfg1.out.is_absolute()) + self.assertEqual(cfg1.name, "alice") + cfg1.name = "bob" # change instance value + self.assertEqual(cfg1.name, "bob") # change is reflected + + # Change env; new instance should reflect new values + with patch.dict(os.environ, {"FLAG": "false", "NAME": ""}, clear=True): + cfg2 = Cfg() + self.assertFalse(cfg2.flag) # str2bool("false") -> False + self.assertTrue("ab" in str(cfg2.out)) + self.assertIsInstance(cfg2.out, Path) + self.assertTrue(cfg2.out.is_absolute()) + self.assertEqual(cfg2.name, "anon") # empty -> fallback to default + + def test_dataclass_path_field_with_default_value(self): + @dataclass + class C2: + out: Path = m.env_path_field("OUT", default="some/dir", resolve=False) + + with patch.dict(os.environ, {}, clear=True): + c = C2() + self.assertEqual(c.out, Path("some/dir")) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_path_helper.py b/.ci/lumen_cli/tests/test_path_helper.py new file mode 100644 index 0000000000000..d90ffa5631f59 --- /dev/null +++ b/.ci/lumen_cli/tests/test_path_helper.py @@ -0,0 +1,122 @@ +# test_path_utils.py +# Run: pytest -q + +import os +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from cli.lib.common.path_helper import ( + copy, + ensure_dir_exists, + force_create_dir, + get_path, + is_path_exist, + remove_dir, +) + + +class TestPathHelper(unittest.TestCase): + def setUp(self): + self.tmpdir = TemporaryDirectory() + self.tmp_path = Path(self.tmpdir.name) + + def tearDown(self): + self.tmpdir.cleanup() + + # -------- get_path -------- + def test_get_path_returns_path_for_str(self): + # Use relative path to avoid absolute-ness + rel_str = "sub/f.txt" + os.chdir(self.tmp_path) + p = get_path(rel_str, resolve=False) + self.assertIsInstance(p, Path) + self.assertFalse(p.is_absolute()) + self.assertEqual(str(p), rel_str) + + def test_get_path_resolves(self): + rel_str = "sub/f.txt" + p = get_path(str(self.tmp_path / rel_str), resolve=True) + self.assertTrue(p.is_absolute()) + self.assertTrue(str(p).endswith(rel_str)) + + def test_get_path_with_path_input(self): + p_in = self.tmp_path / "sub/f.txt" + p_out = get_path(p_in, resolve=False) + 
self.assertTrue(str(p_out) == str(p_in)) + + def test_get_path_with_none_raises(self): + with self.assertRaises(ValueError): + get_path(None) # type: ignore[arg-type] + + def test_get_path_invalid_type_raises(self): + with self.assertRaises(TypeError): + get_path(123) # type: ignore[arg-type] + + # -------- ensure_dir_exists / force_create_dir / remove_dir -------- + def test_ensure_dir_exists_creates_and_is_idempotent(self): + d = self.tmp_path / "made" + ensure_dir_exists(d) + self.assertTrue(d.exists() and d.is_dir()) + ensure_dir_exists(d) + + def test_force_create_dir_clears_existing(self): + d = self.tmp_path / "fresh" + (d / "inner").mkdir(parents=True) + (d / "inner" / "f.txt").write_text("x") + force_create_dir(d) + self.assertTrue(d.exists()) + self.assertEqual(list(d.iterdir()), []) + + def test_remove_dir_none_is_noop(self): + remove_dir(None) # type: ignore[arg-type] + + def test_remove_dir_nonexistent_is_noop(self): + ghost = self.tmp_path / "ghost" + remove_dir(ghost) + + def test_remove_dir_accepts_str(self): + d = self.tmp_path / "to_rm" + d.mkdir() + remove_dir(str(d)) + self.assertFalse(d.exists()) + + # -------- copy -------- + def test_copy_file_to_file(self): + src = self.tmp_path / "src.txt" + dst = self.tmp_path / "out" / "dst.txt" + src.write_text("hello") + copy(src, dst) + self.assertEqual(dst.read_text(), "hello") + + def test_copy_dir_to_new_dir(self): + src = self.tmp_path / "srcdir" + (src / "a").mkdir(parents=True) + (src / "a" / "f.txt").write_text("content") + dst = self.tmp_path / "destdir" + copy(src, dst) + self.assertEqual((dst / "a" / "f.txt").read_text(), "content") + + def test_copy_dir_into_existing_dir_overwrite_true_merges(self): + src = self.tmp_path / "srcdir" + dst = self.tmp_path / "destdir" + (src / "x").mkdir(parents=True) + (src / "x" / "new.txt").write_text("new") + dst.mkdir() + (dst / "existing.txt").write_text("old") + copy(src, dst) + self.assertEqual((dst / "existing.txt").read_text(), "old") + self.assertEqual((dst / "x" / "new.txt").read_text(), "new") + + def test_is_str_path_exist(self): + p = self.tmp_path / "x.txt" + p.write_text("1") + self.assertTrue(is_path_exist(str(p))) + self.assertTrue(is_path_exist(p)) + self.assertFalse(is_path_exist(str(self.tmp_path / "missing"))) + self.assertFalse(is_path_exist(self.tmp_path / "missing")) + self.assertFalse(is_path_exist("")) + + +if __name__ == "__main__": + unittest.main() diff --git a/.ci/lumen_cli/tests/test_run_plan.py b/.ci/lumen_cli/tests/test_run_plan.py new file mode 100644 index 0000000000000..a85ed2e3986f6 --- /dev/null +++ b/.ci/lumen_cli/tests/test_run_plan.py @@ -0,0 +1,185 @@ +# tests/test_run_test_plan.py +import importlib +from contextlib import nullcontext +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + + +MOD = "cli.lib.core.vllm.lib" + +# We import inside tests so the MOD override above applies everywhere +run_test_plan_import_path = f"{MOD}.run_test_plan" + + +def _get_cmd(c): + # Support both kwargs and positional args + return c.kwargs.get("cmd", c.args[0] if c.args else None) + + +def _get_check(c): + if "check" in c.kwargs: + return c.kwargs["check"] + # If positional, assume second arg is 'check' when present; default False + return c.args[1] if len(c.args) > 1 else False + + +@pytest.fixture +def patch_module(monkeypatch): + """ + Patch helpers ('pip_install_packages', 'temp_environ', 'working_directory', + 'run_command', 'logger') inside the target module and expose them. 
+ """ + module = importlib.import_module(MOD) + + # Create fakes/mocks + pip_install_packages = MagicMock(name="pip_install_packages") + run_command = MagicMock(name="run_command", return_value=0) + + # temp_environ / working_directory: record calls but act as context managers + temp_calls: list[dict] = [] + workdir_calls: list[str] = [] + + def fake_working_directory(path: str): + workdir_calls.append(path) + return nullcontext() + + def fake_temp_env(map: dict[str, str]): + temp_calls.append(map) + return nullcontext() + + logger = SimpleNamespace( + info=MagicMock(name="logger.info"), + error=MagicMock(name="logger.error"), + ) + + # Apply patches (raise if attribute doesn't exist) + monkeypatch.setattr( + module, "pip_install_packages", pip_install_packages, raising=True + ) + monkeypatch.setattr(module, "run_command", run_command, raising=True) + monkeypatch.setattr( + module, "working_directory", fake_working_directory, raising=True + ) + monkeypatch.setattr(module, "temp_environ", fake_temp_env, raising=True) + monkeypatch.setattr(module, "logger", logger, raising=True) + + return SimpleNamespace( + module=module, + run_test_plan=module.run_test_plan, # expose to avoid getattr("constant") (Ruff B009) + pip_install_packages=pip_install_packages, + run_command=run_command, + temp_calls=temp_calls, + workdir_calls=workdir_calls, + logger=logger, + ) + + +def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_module): + run_test_plan = patch_module.run_test_plan + + tests_map = { + "basic": { + "title": "Basic suite", + "package_install": [], + "working_directory": "tests", + "env_vars": {"GLOBAL_FLAG": "1"}, + "steps": [ + "export A=x && pytest -q", + "export B=y && pytest -q tests/unit", + ], + } + } + + # One exit code per step (export + two pytest) + patch_module.run_command.side_effect = [0, 0, 0] + + run_test_plan("basic", "cpu", tests_map) + + calls = patch_module.run_command.call_args_list + cmds = [_get_cmd(c) for c in calls] + checks = [_get_check(c) for c in calls] + + assert cmds == [ + "export A=x && pytest -q", + "export B=y && pytest -q tests/unit", + ] + assert all(chk is False for chk in checks) + + assert patch_module.workdir_calls == ["tests"] + assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}] + + +def test_installs_packages_when_present(monkeypatch, patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "with_pkgs": { + "title": "Needs deps", + "package_install": ["timm==1.0.0", "flash-attn"], + "steps": ["pytest -q"], + } + } + + patch_module.run_command.return_value = 0 + + run_test_plan("with_pkgs", "gpu", tests_map) + + patch_module.pip_install_packages.assert_called_once_with( + packages=["timm==1.0.0", "flash-attn"], + prefer_uv=True, + ) + + +def test_raises_on_missing_plan(patch_module): + run_test_plan = patch_module.module.run_test_plan + with pytest.raises(RuntimeError) as ei: + run_test_plan("nope", "cpu", tests_map={}) + + assert "test nope not found" in str(ei.value) + + +def test_aggregates_failures_and_raises(monkeypatch, patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "mix": { + "title": "Some pass some fail", + "steps": [ + "pytest test_a.py", # 0 → pass + "pytest test_b.py", # 1 → fail + "pytest test_c.py", # 2 → fail + ], + } + } + + # Simulate pass, fail, fail + patch_module.run_command.side_effect = [0, 1, 2] + + with pytest.raises(RuntimeError) as ei: + run_test_plan("mix", "cpu", tests_map) + + msg = str(ei.value) + assert "2 pytest runs failed" 
in msg + # Ensure logger captured failed tests list + patch_module.logger.error.assert_called_once() + # And we attempted all three commands + assert patch_module.run_command.call_count == 3 + + +def test_custom_working_directory_used(patch_module): + run_test_plan = patch_module.module.run_test_plan + + tests_map = { + "customwd": { + "title": "Custom wd", + "working_directory": "examples/ci", + "steps": ["pytest -q"], + } + } + + patch_module.run_command.return_value = 0 + run_test_plan("customwd", "cpu", tests_map) + + assert patch_module.workdir_calls == ["examples/ci"] diff --git a/.ci/lumen_cli/tests/test_utils.py b/.ci/lumen_cli/tests/test_utils.py new file mode 100644 index 0000000000000..45ae5ad6d407b --- /dev/null +++ b/.ci/lumen_cli/tests/test_utils.py @@ -0,0 +1,143 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from cli.lib.common.utils import temp_environ, working_directory # <-- replace import + + +class EnvIsolatedTestCase(unittest.TestCase): + """Base class that snapshots os.environ and CWD for isolation.""" + + def setUp(self): + import os + import tempfile + + self._env_backup = dict(os.environ) + + # Snapshot/repair CWD if it's gone + try: + self._cwd_backup = os.getcwd() + except FileNotFoundError: + # If CWD no longer exists, switch to a safe place and record that + self._cwd_backup = tempfile.gettempdir() + os.chdir(self._cwd_backup) + + # Create a temporary directory for the test to run in + self._temp_dir = tempfile.mkdtemp() + os.chdir(self._temp_dir) + + def tearDown(self): + import os + import shutil + import tempfile + + # Restore cwd first (before cleaning up temp dir) + try: + os.chdir(self._cwd_backup) + except OSError: + os.chdir(tempfile.gettempdir()) + + # Clean up temporary directory + try: + shutil.rmtree(self._temp_dir, ignore_errors=True) + except Exception: + pass # Ignore cleanup errors + + # Restore env + to_del = set(os.environ.keys()) - set(self._env_backup.keys()) + for k in to_del: + os.environ.pop(k, None) + for k, v in self._env_backup.items(): + os.environ[k] = v + + +class TestTempEnviron(EnvIsolatedTestCase): + def test_sets_and_restores_new_var(self): + var = "TEST_TMP_ENV_NEW" + self.assertNotIn(var, os.environ) + + with temp_environ({var: "123"}): + self.assertEqual(os.environ[var], "123") + + self.assertNotIn(var, os.environ) # removed after exit + + def test_overwrites_and_restores_existing_var(self): + var = "TEST_TMP_ENV_OVERWRITE" + os.environ[var] = "orig" + + with temp_environ({var: "override"}): + self.assertEqual(os.environ[var], "override") + + self.assertEqual(os.environ[var], "orig") # restored + + def test_multiple_vars_and_missing_cleanup(self): + v1, v2 = "TEST_ENV_V1", "TEST_ENV_V2" + os.environ.pop(v1, None) + os.environ[v2] = "keep" + + with temp_environ({v1: "a", v2: "b"}): + self.assertEqual(os.environ[v1], "a") + self.assertEqual(os.environ[v2], "b") + + self.assertNotIn(v1, os.environ) # newly-added -> removed + self.assertEqual(os.environ[v2], "keep") # pre-existing -> restored + + def test_restores_even_on_exception(self): + var = "TEST_TMP_ENV_EXCEPTION" + self.assertNotIn(var, os.environ) + + with self.assertRaises(RuntimeError): + with temp_environ({var: "x"}): + self.assertEqual(os.environ[var], "x") + raise RuntimeError("boom") + + self.assertNotIn(var, os.environ) # removed after exception + + +class TestWorkingDirectory(EnvIsolatedTestCase): + def test_changes_and_restores(self): + start = Path.cwd() + with tempfile.TemporaryDirectory() as td: + target = Path(td) / "wd" + 
target.mkdir() + + with working_directory(str(target)): + self.assertEqual(Path.cwd().resolve(), target.resolve()) + + self.assertEqual(Path.cwd(), start) + + def test_noop_when_empty_path(self): + start = Path.cwd() + with working_directory(""): + self.assertEqual(Path.cwd(), start) + self.assertEqual(Path.cwd(), start) + + def test_restores_on_exception(self): + start = Path.cwd() + + with tempfile.TemporaryDirectory() as td: + target = Path(td) / "wd_exc" + target.mkdir() + + with self.assertRaises(ValueError): + with working_directory(str(target)): + # Normalize both sides to handle /var -> /private/var + self.assertEqual(Path.cwd().resolve(), target.resolve()) + raise ValueError("boom") + + self.assertEqual(Path.cwd().resolve(), start.resolve()) + + def test_raises_for_missing_dir(self): + start = Path.cwd() + with tempfile.TemporaryDirectory() as td: + missing = Path(td) / "does_not_exist" + with self.assertRaises(FileNotFoundError): + # os.chdir should raise before yielding + with working_directory(str(missing)): + pass + self.assertEqual(Path.cwd(), start) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/.ci/lumen_cli/tests/test_vllm.py b/.ci/lumen_cli/tests/test_vllm.py new file mode 100644 index 0000000000000..849eb0c40ee37 --- /dev/null +++ b/.ci/lumen_cli/tests/test_vllm.py @@ -0,0 +1,176 @@ +import os +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +import cli.lib.core.vllm.vllm_build as vllm_build + + +_VLLM_BUILD_MODULE = "cli.lib.core.vllm.vllm_build" + + +class TestVllmBuildParameters(unittest.TestCase): + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True) + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=True) + @patch( + "cli.lib.common.envs_helper.env_path_optional", + side_effect=lambda name, default=None, resolve=True: { + "DOCKERFILE_PATH": Path("/abs/vllm/Dockerfile"), + "TORCH_WHEELS_PATH": Path("/abs/dist"), + "OUTPUT_DIR": Path("/abs/shared"), + }.get(name, Path(default) if default is not None else None), + ) + @patch.dict( + os.environ, + { + "USE_TORCH_WHEEL": "1", + "USE_LOCAL_BASE_IMAGE": "1", + "USE_LOCAL_DOCKERFILE": "1", + "BASE_IMAGE": "my/image:tag", + "DOCKERFILE_PATH": "vllm/Dockerfile", + "TORCH_WHEELS_PATH": "dist", + "OUTPUT_DIR": "shared", + }, + clear=True, + ) + def test_params_success_normalizes_and_validates( + self, mock_env_path, mock_is_path, mock_local_img + ): + params = vllm_build.VllmBuildParameters() + self.assertEqual(params.torch_whls_path, Path("/abs/dist")) + self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile")) + self.assertEqual(params.output_dir, Path("/abs/shared")) + self.assertEqual(params.base_image, "my/image:tag") + + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) + @patch.dict( + os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True + ) + def test_params_missing_torch_whls_raises(self, _is_path): + with tempfile.TemporaryDirectory() as td: + os.chdir(td) + with self.assertRaises(ValueError) as cm: + vllm_build.VllmBuildParameters( + use_local_base_image=False, + use_local_dockerfile=False, + ) + err = cm.exception + self.assertIn("TORCH_WHEELS_PATH", str(err)) + + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=False) + @patch.dict( + os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True + ) + def test_params_missing_local_base_image_raises(self, _local_img): + with tempfile.TemporaryDirectory() as td: + 
os.chdir(td) + with self.assertRaises(ValueError) as cm: + vllm_build.VllmBuildParameters( + use_torch_whl=False, + use_local_dockerfile=False, + ) + err = cm.exception + self.assertIn("BASE_IMAGE", str(err)) + + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) + @patch.dict( + os.environ, + {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"}, + clear=True, + ) + def test_params_missing_dockerfile_raises(self, _is_path): + with tempfile.TemporaryDirectory() as td: + os.chdir(td) + with self.assertRaises(ValueError) as cm: + vllm_build.VllmBuildParameters( + use_torch_whl=False, + use_local_base_image=False, + ) + err = cm.exception + self.assertIn("DOCKERFILE_PATH", str(err)) + + @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False) + @patch.dict( + os.environ, + {"OUTPUT_DIR": ""}, + clear=True, + ) + def test_params_missing_output_dir(self, _is_path): + with self.assertRaises(FileNotFoundError): + vllm_build.VllmBuildParameters() + + +class TestBuildCmdAndRun(unittest.TestCase): + @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True) + def test_generate_docker_build_cmd_includes_bits(self, _exists): + runner = vllm_build.VllmBuildRunner() + inputs = MagicMock() + inputs.output_dir = Path("/abs/out") + inputs.use_local_base_image = True + inputs.base_image = "img:tag" + inputs.torch_whls_path = Path("./vllm/tmp") + inputs.max_jobs = 64 + inputs.cuda_version = "12.8.1" + inputs.python_version = "3.12" + inputs.sccache_bucket = "my-bucket" + inputs.sccache_region = "us-west-2" + inputs.torch_cuda_arch_list = "8.0;9.0" + inputs.target_stage = "export-wheels" + inputs.tag_name = "vllm-wheels" + + cmd = runner._generate_docker_build_cmd(inputs) + squashed = " ".join(cmd.split()) + + self.assertIn("--output type=local,dest=/abs/out", squashed) + self.assertIn("-f docker/Dockerfile.nightly_torch", squashed) + self.assertIn("--pull=false", squashed) + self.assertIn("--build-arg TORCH_WHEELS_PATH=tmp", squashed) + self.assertIn("--build-arg BUILD_BASE_IMAGE=img:tag", squashed) + self.assertIn("--build-arg FINAL_BASE_IMAGE=img:tag", squashed) + self.assertIn("--build-arg max_jobs=64", squashed) + self.assertIn("--build-arg CUDA_VERSION=12.8.1", squashed) + self.assertIn("--build-arg PYTHON_VERSION=3.12", squashed) + self.assertIn("--build-arg USE_SCCACHE=1", squashed) + self.assertIn("--build-arg SCCACHE_BUCKET_NAME=my-bucket", squashed) + self.assertIn("--build-arg SCCACHE_REGION_NAME=us-west-2", squashed) + self.assertIn("--build-arg torch_cuda_arch_list='8.0;9.0'", squashed) + self.assertIn("--target export-wheels", squashed) + self.assertIn("-t vllm-wheels", squashed) + + @patch(f"{_VLLM_BUILD_MODULE}.run_command") + @patch(f"{_VLLM_BUILD_MODULE}.ensure_dir_exists") + @patch(f"{_VLLM_BUILD_MODULE}.clone_vllm") + @patch.object( + vllm_build.VllmBuildRunner, + "_generate_docker_build_cmd", + return_value="docker buildx ...", + ) + @patch.dict( + os.environ, + { + "USE_TORCH_WHEEL": "0", + "USE_LOCAL_BASE_IMAGE": "0", + "USE_LOCAL_DOCKERFILE": "0", + "OUTPUT_DIR": "shared", + }, + clear=True, + ) + def test_run_calls_clone_prepare_and_build( + self, mock_gen, mock_clone, mock_ensure, mock_run + ): + params = MagicMock() + params.output_dir = Path("shared") + params.use_local_dockerfile = False + params.use_torch_whl = False + + with patch(f"{_VLLM_BUILD_MODULE}.VllmBuildParameters", return_value=params): + runner = vllm_build.VllmBuildRunner() + runner.run() + + mock_clone.assert_called_once() + 
mock_ensure.assert_called_once_with(Path("shared")) + mock_gen.assert_called_once_with(params) + mock_run.assert_called_once() + _, kwargs = mock_run.call_args + assert kwargs.get("cwd") == "vllm" diff --git a/.ci/magma/Makefile b/.ci/magma/Makefile index 5035e1ee3b2c6..4169aedd03fa5 100644 --- a/.ci/magma/Makefile +++ b/.ci/magma/Makefile @@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma/build_magma.sh .PHONY: all +all: magma-cuda130 all: magma-cuda129 all: magma-cuda128 all: magma-cuda126 @@ -25,6 +26,12 @@ clean: $(RM) -r magma-* $(RM) -r output +.PHONY: magma-cuda130 +magma-cuda130: DESIRED_CUDA := 13.0 +magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +magma-cuda130: + $(DOCKER_RUN) + .PHONY: magma-cuda129 magma-cuda129: DESIRED_CUDA := 12.9 magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 diff --git a/.ci/magma/build_magma.sh b/.ci/magma/build_magma.sh index 3ac0bcaf1d5ba..6f1924fa45965 100755 --- a/.ci/magma/build_magma.sh +++ b/.ci/magma/build_magma.sh @@ -28,6 +28,7 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION} patch < ${PACKAGE_FILES}/CMake.patch patch < ${PACKAGE_FILES}/cmakelists.patch patch -p0 < ${PACKAGE_FILES}/thread_queue.patch +patch -p1 < ${PACKAGE_FILES}/cuda13.patch patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch # The build.sh script expects to be executed from the sources root folder @@ -37,6 +38,7 @@ popd # Package recipe, license and tarball # Folder and package name are backward compatible for the build workflow cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh +cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch diff --git a/.ci/magma/package_files/cuda13.patch b/.ci/magma/package_files/cuda13.patch new file mode 100644 index 0000000000000..d6ebaf9dfaae7 --- /dev/null +++ b/.ci/magma/package_files/cuda13.patch @@ -0,0 +1,26 @@ +diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp +index 73fed1b20..e77519bfe 100644 +--- a/interface_cuda/interface.cpp ++++ b/interface_cuda/interface.cpp +@@ -438,14 +438,20 @@ magma_print_environment() + cudaDeviceProp prop; + err = cudaGetDeviceProperties( &prop, dev ); + check_error( err ); ++ #ifdef MAGMA_HAVE_CUDA ++#if CUDA_VERSION < 13000 + printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n", + dev, + prop.name, + prop.clockRate / 1000., ++#else ++ printf( "%% device %d: %s, ??? 
MHz clock, %.1f MiB memory, capability %d.%d\n", ++ dev, ++ prop.name, ++#endif + prop.totalGlobalMem / (1024.*1024.), + prop.major, + prop.minor ); +- #ifdef MAGMA_HAVE_CUDA + int arch = prop.major*100 + prop.minor*10; + if ( arch < MAGMA_CUDA_ARCH_MIN ) { + printf("\n" diff --git a/.ci/manywheel/build.sh b/.ci/manywheel/build.sh index 4c4d51134715a..6b2a60bc5ca28 100755 --- a/.ci/manywheel/build.sh +++ b/.ci/manywheel/build.sh @@ -5,10 +5,6 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" case "${GPU_ARCH_TYPE:-BLANK}" in - BLANK) - # Legacy behavior for CircleCI - bash "${SCRIPTPATH}/build_cuda.sh" - ;; cuda) bash "${SCRIPTPATH}/build_cuda.sh" ;; diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index 49549c9f2994e..4c268befb30e5 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -138,28 +138,11 @@ fi echo "Calling setup.py bdist at $(date)" -if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ - BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \ +time CMAKE_ARGS=${CMAKE_ARGS[@]} \ + EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR - echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" - time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ - BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \ - BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ - USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ - CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR - echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" -else - time CMAKE_ARGS=${CMAKE_ARGS[@]} \ - EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ - BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ - USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ - python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR -fi echo "Finished setup.py bdist at $(date)" # Build libtorch packages @@ -272,10 +255,6 @@ ls /tmp/$WHEELHOUSE_DIR mkdir -p "/$WHEELHOUSE_DIR" mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/ -if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true -fi - if [[ -n "$BUILD_PYTHONLESS" ]]; then mkdir -p /$LIBTORCH_HOUSE_DIR mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR @@ -452,16 +431,8 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then pushd $PYTORCH_ROOT/test # Install the wheel for this Python version - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true - fi - pip uninstall -y "$TORCH_PACKAGE_NAME" - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v - fi - pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v # Print info on the libraries installed in this wheel diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 39586faa85f87..6ed38f8b25c62 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -66,6 +66,9 @@ case ${CUDA_VERSION} in TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" fi 
;; + 13.0) + TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" + ;; 12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" ;; @@ -110,13 +113,18 @@ DEPS_SONAME=( ) -# CUDA_VERSION 12.6, 12.8, 12.9 -if [[ $CUDA_VERSION == 12* ]]; then +# CUDA_VERSION 12.*, 13.* +if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well - export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + # Compress the fatbin with -compress-mode=size for CUDA 13 + if [[ $CUDA_VERSION == 13* ]]; then + export TORCH_NVCC_FLAGS="$TORCH_NVCC_FLAGS -compress-mode=size" + fi if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." + DEPS_LIST+=( "/usr/local/cuda/lib64/libcudnn_adv.so.9" "/usr/local/cuda/lib64/libcudnn_cnn.so.9" @@ -126,15 +134,11 @@ if [[ $CUDA_VERSION == 12* ]]; then "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9" "/usr/local/cuda/lib64/libcudnn_heuristic.so.9" "/usr/local/cuda/lib64/libcudnn.so.9" - "/usr/local/cuda/lib64/libcublas.so.12" - "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcusparseLt.so.0" - "/usr/local/cuda/lib64/libcudart.so.12" - "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/lib64/libnvrtc-builtins.so" "/usr/local/cuda/lib64/libcufile.so.0" "/usr/local/cuda/lib64/libcufile_rdma.so.1" - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12" + "/usr/local/cuda/lib64/libnvshmem_host.so.3" "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so" ) DEPS_SONAME+=( @@ -146,41 +150,83 @@ if [[ $CUDA_VERSION == 12* ]]; then "libcudnn_engines_precompiled.so.9" "libcudnn_heuristic.so.9" "libcudnn.so.9" - "libcublas.so.12" - "libcublasLt.so.12" "libcusparseLt.so.0" - "libcudart.so.12" - "libnvrtc.so.12" "libnvrtc-builtins.so" + "libnvshmem_host.so.3" "libcufile.so.0" "libcufile_rdma.so.1" - "libcupti.so.12" "libnvperf_host.so" ) # Add libnvToolsExt only if CUDA version is not 12.9 - if [[ $CUDA_VERSION != 12.9* ]]; then - DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1") - DEPS_SONAME+=("libnvToolsExt.so.1") + if [[ $CUDA_VERSION == 13* ]]; then + DEPS_LIST+=( + "/usr/local/cuda/lib64/libcublas.so.13" + "/usr/local/cuda/lib64/libcublasLt.so.13" + "/usr/local/cuda/lib64/libcudart.so.13" + "/usr/local/cuda/lib64/libnvrtc.so.13" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13" + "/usr/local/cuda/lib64/libibverbs.so.1" + "/usr/local/cuda/lib64/librdmacm.so.1" + "/usr/local/cuda/lib64/libmlx5.so.1" + "/usr/local/cuda/lib64/libnl-3.so.200" + "/usr/local/cuda/lib64/libnl-route-3.so.200") + DEPS_SONAME+=( + "libcublas.so.13" + "libcublasLt.so.13" + "libcudart.so.13" + "libnvrtc.so.13" + "libcupti.so.13" + "libibverbs.so.1" + "librdmacm.so.1" + "libmlx5.so.1" + "libnl-3.so.200" + "libnl-route-3.so.200") + export USE_CUPTI_SO=1 + export ATEN_STATIC_CUDA=0 + export USE_CUDA_STATIC_LINK=0 + export USE_CUFILE=0 + else + DEPS_LIST+=( + "/usr/local/cuda/lib64/libnvToolsExt.so.1" + "/usr/local/cuda/lib64/libcublas.so.12" + "/usr/local/cuda/lib64/libcublasLt.so.12" + "/usr/local/cuda/lib64/libcudart.so.12" + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12") + DEPS_SONAME+=( + "libnvToolsExt.so.1" + "libcublas.so.12" + "libcublasLt.so.12" + "libcudart.so.12" + "libnvrtc.so.12" + "libcupti.so.12") fi else echo "Using nvidia libs from pypi." 
CUDA_RPATHS=( - '$ORIGIN/../../nvidia/cublas/lib' - '$ORIGIN/../../nvidia/cuda_cupti/lib' - '$ORIGIN/../../nvidia/cuda_nvrtc/lib' - '$ORIGIN/../../nvidia/cuda_runtime/lib' '$ORIGIN/../../nvidia/cudnn/lib' - '$ORIGIN/../../nvidia/cufft/lib' - '$ORIGIN/../../nvidia/curand/lib' - '$ORIGIN/../../nvidia/cusolver/lib' - '$ORIGIN/../../nvidia/cusparse/lib' - '$ORIGIN/../../nvidia/cusparselt/lib' - '$ORIGIN/../../cusparselt/lib' - '$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nvshmem/lib' - '$ORIGIN/../../nvidia/nvtx/lib' - '$ORIGIN/../../nvidia/cufile/lib' + '$ORIGIN/../../nvidia/nccl/lib' + '$ORIGIN/../../nvidia/cusparselt/lib' ) + if [[ $CUDA_VERSION == 13* ]]; then + CUDA_RPATHS+=('$ORIGIN/../../nvidia/cu13/lib') + else + CUDA_RPATHS+=( + '$ORIGIN/../../nvidia/cublas/lib' + '$ORIGIN/../../nvidia/cuda_cupti/lib' + '$ORIGIN/../../nvidia/cuda_nvrtc/lib' + '$ORIGIN/../../nvidia/cuda_runtime/lib' + '$ORIGIN/../../nvidia/cufft/lib' + '$ORIGIN/../../nvidia/curand/lib' + '$ORIGIN/../../nvidia/cusolver/lib' + '$ORIGIN/../../nvidia/cusparse/lib' + '$ORIGIN/../../cusparselt/lib' + '$ORIGIN/../../nvidia/nvtx/lib' + '$ORIGIN/../../nvidia/cufile/lib' + ) + fi + CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' diff --git a/.ci/manywheel/build_xpu.sh b/.ci/manywheel/build_xpu.sh index ff157b1c0b205..bd7b168be336c 100755 --- a/.ci/manywheel/build_xpu.sh +++ b/.ci/manywheel/build_xpu.sh @@ -25,6 +25,7 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh export USE_STATIC_MKL=1 export USE_ONEMKL=1 export USE_XCCL=1 +export USE_MPI=0 WHEELHOUSE_DIR="wheelhousexpu" LIBTORCH_HOUSE_DIR="libtorch_housexpu" diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index a7ce0fef736cf..1c88554c2af96 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then export ATEN_THREADING=NATIVE fi -# Enable LLVM dependency for TensorExpr testing -export USE_LLVM=/opt/llvm -export LLVM_DIR=/opt/llvm/lib/cmake/llvm if ! 
which conda; then # In ROCm CIs, we are doing cross compilation on build machines with @@ -95,6 +92,27 @@ if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then export ACL_ROOT_DIR=/ComputeLibrary fi +if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then + if [[ -f /opt/riscv-cross-env/bin/activate ]]; then + # shellcheck disable=SC1091 + source /opt/riscv-cross-env/bin/activate + else + echo "Activation file not found" + exit 1 + fi + + export CMAKE_CROSSCOMPILING=TRUE + export CMAKE_SYSTEM_NAME=Linux + export CMAKE_SYSTEM_PROCESSOR=riscv64 + + export USE_CUDA=0 + export USE_MKLDNN=0 + + export SLEEF_TARGET_EXEC_USE_QEMU=ON + sudo chown -R jenkins /var/lib/jenkins/workspace /opt + +fi + if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then POSSIBLE_JAVA_HOMES=() POSSIBLE_JAVA_HOMES+=(/usr/local) @@ -155,6 +173,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then source /opt/intel/oneapi/mpi/latest/env/vars.sh # Enable XCCL build export USE_XCCL=1 + export USE_MPI=0 # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc @@ -176,8 +195,16 @@ fi # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of # memory to build and will OOM -if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then - export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2" + +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then + J=2 # default to 2 jobs + case "$RUNNER" in + linux.12xlarge.memory|linux.24xlarge.memory) + J=24 + ;; + esac + echo "Building FlashAttention with job limit $J" + export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}" fi if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then @@ -192,7 +219,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then export USE_ASAN=1 export REL_WITH_DEB_INFO=1 export UBSAN_FLAGS="-fno-sanitize-recover=all" - unset USE_LLVM fi if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then @@ -213,7 +239,7 @@ fi # Do not change workspace permissions for ROCm and s390x CI jobs # as it can leave workspace with bad permissions for cancelled jobs -if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then +if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") cleanup_workspace() { @@ -258,29 +284,19 @@ else # XLA test build fails when WERROR=1 # set only when building other architectures # or building non-XLA tests. 
- if [[ "$BUILD_ENVIRONMENT" != *rocm* && - "$BUILD_ENVIRONMENT" != *xla* ]]; then + if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then # Install numpy-2.0.2 for builds which are backward compatible with 1.X python -mpip install numpy==2.0.2 WERROR=1 python setup.py clean - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - python3 tools/packaging/split_wheel.py bdist_wheel - else - WERROR=1 python setup.py bdist_wheel - fi + WERROR=1 python setup.py bdist_wheel else python setup.py clean if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then source .ci/pytorch/install_cache_xla.sh fi - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - echo "USE_SPLIT_BUILD cannot be used with xla or rocm" - exit 1 - else - python setup.py bdist_wheel - fi + python setup.py bdist_wheel fi pip_install_whl "$(echo dist/*.whl)" @@ -405,7 +421,7 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build python tools/stats/export_test_times.py fi -# don't do this for bazel or s390x as they don't use sccache -if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then +# don't do this for bazel or s390x or riscv64 as they don't use sccache +if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then print_sccache_stats fi diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index 78baf6a0761d7..0f632f8006c07 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -300,24 +300,3 @@ except RuntimeError as e: exit 1 fi fi - -############################################################################### -# Check for C++ ABI compatibility to GCC-11 - GCC 13 -############################################################################### -if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then - pushd /tmp - # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html - # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19 - # gcc 11 - CUDA 11.8, xpu, rocm - # gcc 13 - CUDA 12.6, 12.8 and cpu - # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426 - if [[ "$(uname -m)" == "s390x" ]]; then - cxx_abi="19" - elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then - cxx_abi="18" - else - cxx_abi="16" - fi - python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)" - popd -fi diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 4771544b8b9b1..edfff60744919 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -149,6 +149,19 @@ function get_pinned_commit() { cat .github/ci_commit_pins/"${1}".txt } +function detect_cuda_arch() { + if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then + if command -v nvidia-smi; then + TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1) + elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then + # There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST to the default + # minimum supported value here + TORCH_CUDA_ARCH_LIST=8.0 + fi + export TORCH_CUDA_ARCH_LIST + fi +} + function install_torchaudio() { local commit commit=$(get_pinned_commit audio) @@ -229,7 +242,6 @@ function install_torchrec_and_fbgemm() { pip_install tabulate # needed for newer fbgemm pip_install patchelf # needed for rocm fbgemm - pushd 
/tmp local wheel_dir=dist/fbgemm_gpu local found_whl=0 @@ -264,7 +276,6 @@ function install_torchrec_and_fbgemm() { done rm -rf fbgemm - popd else pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu @@ -283,30 +294,6 @@ function clone_pytorch_xla() { fi } -function checkout_install_torchbench() { - local commit - commit=$(get_pinned_commit torchbench) - git clone https://github.com/pytorch/benchmark torchbench - pushd torchbench - git checkout "$commit" - - if [ "$1" ]; then - python install.py --continue_on_fail models "$@" - else - # Occasionally the installation may fail on one model but it is ok to continue - # to install and test other models - python install.py --continue_on_fail - fi - - # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488 - # is regressing speedup metric. This needs to be investigated further - pip install transformers==4.38.1 - - echo "Print all dependencies after TorchBench is installed" - python -mpip freeze - popd -} - function install_torchao() { local commit commit=$(get_pinned_commit torchao) diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index d7447e7d48582..d41c3c08e6288 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -35,11 +35,10 @@ fi print_cmake_info if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then - # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls - USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel + USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel else - # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests - # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 + # NB: we always build with distributed; USE_DISTRIBUTED turns off all + # backends (specifically the gloo backend), so test that this case works too USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64 fi if which sccache > /dev/null; then diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 83f8e4e04331d..79d47da431712 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available( fi popd +python -mpip install -r requirements.txt + # enable debug asserts in serialization export TORCH_SERIALIZATION_DEBUG=1 +python -mpip install --no-input -r requirements.txt + setup_test_python() { # The CircleCI worker hostname doesn't resolve to an address. # This environment variable makes ProcessGroupGloo default to @@ -157,6 +161,34 @@ test_jit_hooks() { assert_git_not_dirty } +# Shellcheck doesn't like it when you pass no arguments to a function +# that can take args. 
See https://www.shellcheck.net/wiki/SC2120 +# shellcheck disable=SC2120 +checkout_install_torchbench() { + local commit + commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt) + git clone https://github.com/pytorch/benchmark torchbench + pushd torchbench + git checkout "$commit" + + if [ "$1" ]; then + python install.py --continue_on_fail models "$@" + else + # Occasionally the installation may fail on one model but it is ok to continue + # to install and test other models + python install.py --continue_on_fail + fi + popd + + pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt + # https://github.com/pytorch/pytorch/issues/160689 to remove torchao because + # its current version 0.12.0 doesn't work with transformers 4.54.0 + pip uninstall -y torchao + + echo "Print all dependencies after TorchBench is installed" + python -mpip freeze +} + torchbench_setup_macos() { git clone --recursive https://github.com/pytorch/vision torchvision git clone --recursive https://github.com/pytorch/audio torchaudio @@ -167,7 +199,7 @@ torchbench_setup_macos() { git checkout "$(cat ../.github/ci_commit_pins/vision.txt)" git submodule update --init --recursive python setup.py clean - python setup.py develop + python -m pip install -e . -v --no-build-isolation popd pushd torchaudio @@ -176,11 +208,9 @@ torchbench_setup_macos() { git submodule update --init --recursive python setup.py clean #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp - USE_OPENMP=0 python setup.py develop + USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation popd - # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120 - # shellcheck disable=SC2119,SC2120 checkout_install_torchbench } @@ -276,6 +306,47 @@ test_torchbench_smoketest() { fi done + echo "Pytorch benchmark on mps device completed" +} + +test_aoti_torchbench_smoketest() { + print_cmake_info + + echo "Launching AOTInductor torchbench setup" + pip_benchmark_deps + # shellcheck disable=SC2119,SC2120 + torchbench_setup_macos + + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + local device=mps + local dtypes=(undefined float16 bfloat16 notset) + local dtype=${dtypes[$1]} + local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + + echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}" + local dtype_arg="--${dtype}" + if [ "$dtype" == notset ]; then + dtype_arg="--float32" + fi + touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true + done + + echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}" + PYTHONPATH="$(pwd)"/torchbench python 
benchmarks/dynamo/huggingface.py \ + --performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true echo "Pytorch benchmark on mps device completed" } @@ -324,6 +395,8 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then test_timm_perf elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then test_torchbench_smoketest "${SHARD_NUMBER}" +elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then + test_aoti_torchbench_smoketest "${SHARD_NUMBER}" elif [[ $TEST_CONFIG == *"mps"* ]]; then test_python_mps elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh index 1a0f44b8f98a3..219463f318dbd 100755 --- a/.ci/pytorch/multigpu-test.sh +++ b/.ci/pytorch/multigpu-test.sh @@ -45,6 +45,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then # DTensor tests time python test/run_test.py --verbose -i distributed/tensor/test_random_ops time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile + time python test/run_test.py --verbose -i distributed/tensor/test_utils.py # DeviceMesh test time python test/run_test.py --verbose -i distributed/test_device_mesh diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 9f2a67b4ff45b..e0d47259676b7 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -91,6 +91,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then export VALGRIND=OFF fi +detect_cuda_arch if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then # There are additional warnings on s390x, maybe due to newer gcc. @@ -495,6 +496,14 @@ test_inductor_cpp_wrapper_shard() { -k 'take' \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose + + if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then + python test/run_test.py \ + --include inductor/test_mkldnn_pattern_matcher \ + -k 'xpu' \ + --shard "$1" "$NUM_TEST_SHARDS" \ + --verbose + fi } # "Global" flags for inductor benchmarking controlled by TEST_CONFIG @@ -1051,20 +1060,10 @@ test_libtorch_api() { mkdir -p $TEST_REPORTS_DIR OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml - "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml else # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest" - # On s390x, pytorch is built without llvm. - # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and - # test fails with errors like: - # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer - # unknown file: Failure - # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. 
Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) } - if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then - python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr - fi fi # quantization is not fully supported on s390x yet @@ -1639,6 +1638,10 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then install_torchvision build_xla test_xla +elif [[ "$TEST_CONFIG" == *vllm* ]]; then + echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST" + (cd .ci/lumen_cli && python -m pip install -e .) + python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS" elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then @@ -1684,43 +1687,34 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then elif [[ "${TEST_CONFIG}" == cachebench ]]; then install_torchaudio install_torchvision - checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco - PYTHONPATH=$(pwd)/torchbench test_cachebench + PYTHONPATH=/torchbench test_cachebench elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then install_torchaudio install_torchvision - checkout_install_torchbench nanogpt - PYTHONPATH=$(pwd)/torchbench test_verify_cachebench + PYTHONPATH=/torchbench test_verify_cachebench elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then install_torchaudio install_torchvision - install_torchao id=$((SHARD_NUMBER-1)) # https://github.com/opencv/opencv-python/issues/885 pip_install opencv-python==4.8.0.74 if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then - checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer - PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf + PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then - checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \ - llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \ - functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0 - PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf + PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then - checkout_install_torchbench - TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest + TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest else - checkout_install_torchbench # Do this after checkout_install_torchbench to ensure we clobber any # nightlies that torchbench may pull in if [[ "${TEST_CONFIG}" != *cpu* ]]; then install_torchrec_and_fbgemm fi - PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id" + PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id" fi elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then install_torchvision - PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" + PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" if [[ "$SHARD_NUMBER" -eq "1" ]]; then test_inductor_aoti fi diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 7ceb425ce2d1a..19d715b9d0b6d 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -61,9 +61,10 @@ if "%USE_XPU%"=="1" ( call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat" call "C:\Program Files 
(x86)\Intel\oneAPI\ocloc\latest\env\vars.bat" if errorlevel 1 exit /b 1 - :: Reduce build time. Only have MTL self-hosted runner now - SET TORCH_XPU_ARCH_LIST=xe-lpg - SET USE_KINETO=0 + :: Reduce build time + SET TORCH_XPU_ARCH_LIST=bmg + :: Re-setup python env for build + call pip install -r requirements.txt ) @echo on diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index be7f3e4bb35cc..43524dc04e3fb 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard== python -m pip install z3-solver==4.15.1.0 # Install tlparse for test\dynamo\test_structured_trace.py UTs. -python -m pip install tlparse==0.3.30 +python -m pip install tlparse==0.4.0 # Install parameterized python -m pip install parameterized==0.8.1 diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat index dd30cc25d4a66..efb8cfec63e7e 100644 --- a/.ci/pytorch/windows/cuda126.bat +++ b/.ci/pytorch/windows/cuda126.bat @@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" ( ) IF "%BUILD_VISION%" == "" ( - set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0 + set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 diff --git a/.ci/pytorch/windows/cuda130.bat b/.ci/pytorch/windows/cuda130.bat new file mode 100644 index 0000000000000..f38cd789f2da6 --- /dev/null +++ b/.ci/pytorch/windows/cuda130.bat @@ -0,0 +1,59 @@ +@echo off + +set MODULE_NAME=pytorch + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd %~dp0 +) ELSE ( + call internal\clean.bat +) +IF ERRORLEVEL 1 goto :eof + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto :eof + +REM Check for optional components + +set USE_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( + set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt + ) ELSE ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + ) +) + +IF "%CUDA_PATH_V130%"=="" ( + IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\nvcc.exe" ( + set "CUDA_PATH_V130=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" + ) ELSE ( + echo CUDA 13.0 not found, failing + exit /b 1 + ) +) + +IF "%BUILD_VISION%" == "" ( + set TORCH_CUDA_ARCH_LIST=7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all +) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +) + +set "CUDA_PATH=%CUDA_PATH_V130%" +set "PATH=%CUDA_PATH_V130%\bin;%PATH%" + +:optcheck + +call internal\check_opts.bat +IF ERRORLEVEL 1 goto :eof + +if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\.. 
+call %~dp0\internal\copy.bat +IF ERRORLEVEL 1 goto :eof + +call %~dp0\internal\setup.bat +IF ERRORLEVEL 1 goto :eof diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index 40f2bd7acdbb9..e0281c0d78a44 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -1,12 +1,20 @@ -copy "%CUDA_PATH%\bin\cusparse*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cublas*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cudart*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\curand*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cufft*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cusolver*64_*.dll*" pytorch\torch\lib + +if %CUDA_VERSION% geq 130 ( + set "dll_path=bin\x64" +) else ( + set "dll_path=bin" +) + +copy "%CUDA_PATH%\%dll_path%\cusparse*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cublas*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cudart*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\curand*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cufft*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cusolver*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvrtc*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvJitLink_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\nvperf_host*.dll*" pytorch\torch\lib @@ -20,8 +28,3 @@ copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib if exist "C:\Windows\System32\zlibwapi.dll" ( copy "C:\Windows\System32\zlibwapi.dll" pytorch\torch\lib ) - -::copy nvJitLink dll is requires for cuda 12+ -if exist "%CUDA_PATH%\bin\nvJitLink_*.dll*" ( - copy "%CUDA_PATH%\bin\nvJitLink_*.dll*" pytorch\torch\lib -) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index a0eb650f8506a..1349d3e661f55 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -26,6 +26,7 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR% if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 if %CUDA_VER% EQU 129 goto cuda129 +if %CUDA_VER% EQU 130 goto cuda130 echo CUDA %CUDA_VERSION_STR% is not supported exit /b 1 @@ -113,6 +114,33 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" goto cuda_common +:cuda130 + +set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe +if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" + set "ARGS=" +) + +set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive +set CUDNN_LIB_FOLDER="lib" +set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" +if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +) + +@REM cuDNN 8.3+ required zlib to be installed on the path +echo Installing ZLIB dlls +curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" 
--output "%SRC_DIR%\temp_build\zlib123dllx64.zip" +7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" +xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" + +goto cuda_common + :cuda_common :: NOTE: We only install CUDA if we don't have it installed already. :: With GHA runners these should be pre-installed as part of our AMI process diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 5ed3a236c09a0..2c173aed818b4 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,9 +1,9 @@ -set WIN_DRIVER_VN=528.89 -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe +set WIN_DRIVER_VN=580.88 +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore +curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe if errorlevel 1 exit /b 1 -start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot +start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot if errorlevel 1 exit /b 1 -del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL +del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat index 73622bd736edd..84d0f9caccefb 100644 --- a/.ci/pytorch/windows/internal/install_python.bat +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -1,12 +1,22 @@ set ADDITIONAL_OPTIONS="" set PYTHON_EXEC="python" + + if "%DESIRED_PYTHON%" == "3.13t" ( echo Python version is set to 3.13t set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" set ADDITIONAL_OPTIONS="Include_freethreaded=1" set PYTHON_EXEC="python3.13t" +) else if "%DESIRED_PYTHON%"=="3.14" ( + echo Python version is set to 3.14 or 3.14t + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe" +) else if "%DESIRED_PYTHON%"=="3.14t" ( + echo Python version is set to 3.14 or 3.14t + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe" + set ADDITIONAL_OPTIONS="Include_freethreaded=1" + set PYTHON_EXEC="python3.14t" ) else ( - echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON% + echo Python version is set to %DESIRED_PYTHON% set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =% ) diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index 2296adf4dfe66..f143571a56922 100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" :xpu_bundle_install_start set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI -set 
XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product -set XPU_BUNDLE_VERSION=2025.0.1+20 +set XPU_BUNDLE_VERSION=2025.1.3+5 set XPU_BUNDLE_INSTALLED=0 set XPU_BUNDLE_UNINSTALL=0 set XPU_EXTRA_URL=NULL @@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 -if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( - set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe - set XPU_BUNDLE_VERSION=2025.1.3+5 +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe + set XPU_BUNDLE_VERSION=2025.2.1+20 ) :: Check if XPU bundle is target version or already installed @@ -90,14 +90,3 @@ if errorlevel 1 exit /b 1 del xpu_extra.exe :xpu_install_end - -if not "%XPU_ENABLE_KINETO%"=="1" goto install_end -:: Install Level Zero SDK -set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip -curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip" -echo "Installing level zero SDK..." -7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero" -set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%" -del "%SRC_DIR%\temp_build\level_zero_sdk.zip" - -:install_end diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat index 9b492eef664d7..dbdc9891324cc 100644 --- a/.ci/pytorch/windows/setup_build.bat +++ b/.ci/pytorch/windows/setup_build.bat @@ -7,6 +7,8 @@ call "internal\install_python.bat" %PYTHON_EXEC% --version set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%" +if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake +if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 878d6595c84c0..763fce4b73e18 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -124,20 +124,31 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages -export MACOSX_DEPLOYMENT_TARGET=10.15 +export MACOSX_DEPLOYMENT_TARGET=11.0 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} -SETUPTOOLS_PINNED_VERSION="==70.1.0" -PYYAML_PINNED_VERSION="=5.3" EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" RENAME_WHEEL=true case $desired_python in + 3.14t) + echo "Using 3.14 deps" + NUMPY_PINNED_VERSION="==2.1.0" + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + RENAME_WHEEL=false + ;; + 3.14) + echo "Using 3.14t deps" + 
NUMPY_PINNED_VERSION="==2.1.0" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + RENAME_WHEEL=false + ;; 3.13t) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" - NUMPY_PINNED_VERSION="=2.1.0" + NUMPY_PINNED_VERSION="==2.1.0" CONDA_ENV_CREATE_FLAGS="python-freethreading" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" desired_python="3.13" @@ -145,37 +156,23 @@ case $desired_python in ;; 3.13) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" - NUMPY_PINNED_VERSION="=2.1.0" + NUMPY_PINNED_VERSION="==2.1.0" ;; 3.12) echo "Using 3.12 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" - NUMPY_PINNED_VERSION="=2.0.2" + NUMPY_PINNED_VERSION="==2.0.2" ;; 3.11) echo "Using 3.11 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="=2.0.2" + NUMPY_PINNED_VERSION="==2.0.2" ;; 3.10) echo "Using 3.10 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="=2.0.2" - ;; - 3.9) - echo "Using 3.9 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="=2.0.2" + NUMPY_PINNED_VERSION="==2.0.2" ;; *) - echo "Using default deps" - NUMPY_PINNED_VERSION="=1.11.3" + echo "Unsupported version $desired_python" + exit 1 ;; esac @@ -184,17 +181,18 @@ tmp_env_name="wheel_py$python_nodot" conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} source activate "$tmp_env_name" -retry pip install -r "${pytorch_rootdir}/requirements-build.txt" -pip install "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing-extensions +PINNED_PACKAGES=( + "numpy${NUMPY_PINNED_VERSION}" +) +retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt" +pip install requests ninja typing-extensions retry pip install -r "${pytorch_rootdir}/requirements.txt" || true retry brew install libomp -# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule +# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which +# is build as part of tensorpipe submodule export USE_DISTRIBUTED=1 -if [[ -n "$CROSS_COMPILE_ARM64" ]]; then - export CMAKE_OSX_ARCHITECTURES=arm64 -fi export USE_MKLDNN=OFF export USE_QNNPACK=OFF export BUILD_TEST=OFF @@ -202,16 +200,7 @@ export BUILD_TEST=OFF pushd "$pytorch_rootdir" echo "Calling setup.py bdist_wheel at $(date)" -if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir" - echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" - echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" - BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir" - echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" -else - python setup.py bdist_wheel -d "$whl_tmp_dir" -fi +python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version} echo "Finished setup.py bdist_wheel at $(date)" diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index 11678cabb2c31..c24a50b8b17ed 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ 
b/.circleci/scripts/binary_linux_test.sh @@ -65,16 +65,8 @@ fi if [[ "$PACKAGE_TYPE" != libtorch ]]; then if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then - if [[ "$USE_SPLIT_BUILD" == "true" ]]; then - pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)" - pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)" - # todo: after folder is populated use the pypi_pkg channel instead - pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg" - retry pip install -q numpy protobuf typing-extensions - else - pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" - retry pip install -q numpy protobuf typing-extensions - fi + pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}" + retry pip install -q numpy protobuf typing-extensions else pip install "\$pkg" retry pip install -q numpy protobuf typing-extensions diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 0257c5843e80e..27f0a37f3fb48 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -77,8 +77,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" -# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. -if [[ "$DESIRED_CUDA" == "cu129" ]]; then +# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries. +if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then TRITON_CONSTRAINT="platform_system == 'Linux'" fi @@ -137,7 +137,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}" export DESIRED_CUDA="$DESIRED_CUDA" export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}" export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}" -export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}" if [[ "${OSTYPE}" == "msys" ]]; then export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}" if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then diff --git a/.circleci/scripts/binary_upload.sh b/.circleci/scripts/binary_upload.sh index cf87748d538ce..d48077e112455 100755 --- a/.circleci/scripts/binary_upload.sh +++ b/.circleci/scripts/binary_upload.sh @@ -23,10 +23,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then AWS_S3_CP="aws s3 cp" fi -if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then - UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg" -fi - # this is special build with all dependencies packaged if [[ ${BUILD_NAME} == *-full* ]]; then UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full" @@ -55,16 +51,12 @@ s3_upload() { s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" fi ( - cache_control_flag="" - if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then - cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" - fi for pkg in ${PKG_DIR}/*.${extension}; do ( set -x shm_id=$(sha256sum "${pkg}" | awk '{print $1}') ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ - --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} + --metadata "checksum-sha256=${shm_id}" ) done ) diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 27cd36f949280..18dcde50e2b65 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -15,8 +15,7 @@ fi if [[ "$DESIRED_CUDA" == 'xpu' ]]; 
then export VC_YEAR=2022 export USE_SCCACHE=0 - export XPU_VERSION=2025.1 - export XPU_ENABLE_KINETO=1 + export XPU_VERSION=2025.2 fi echo "Free space on filesystem before build:" diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 79f714265f2c2..9326d9037e8b3 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -8,7 +8,7 @@ export VC_YEAR=2022 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 - export XPU_VERSION=2025.1 + export XPU_VERSION=2025.2 fi pushd "$PYTORCH_ROOT/.ci/pytorch/" diff --git a/.flake8 b/.flake8 index 3e8a6c3a5115a..fc9ab167fbeef 100644 --- a/.flake8 +++ b/.flake8 @@ -48,6 +48,7 @@ per-file-ignores = torch/__init__.py: F401,TOR901 torch/_custom_op/impl.py: TOR901 torch/_export/serde/upgrade.py: TOR901 + torch/_functorch/predispatch.py: TOR901 torch/_functorch/vmap.py: TOR901 torch/_inductor/test_operators.py: TOR901 torch/_library/abstract_impl.py: TOR901 diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 647671e8c83d2..798dee312306d 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -12,7 +12,9 @@ self-hosted-runner: - linux.9xlarge.ephemeral - am2.linux.9xlarge.ephemeral - linux.12xlarge + - linux.12xlarge.memory - linux.24xlarge + - linux.24xlarge.memory - linux.24xlarge.ephemeral - linux.24xlarge.amd - linux.arm64.2xlarge @@ -54,6 +56,7 @@ self-hosted-runner: - linux.rocm.gpu.2 - linux.rocm.gpu.4 # gfx942 runners + - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 - rocm-docker diff --git a/.github/actions/build-external-packages/action.yml b/.github/actions/build-external-packages/action.yml new file mode 100644 index 0000000000000..c0c727d93ac66 --- /dev/null +++ b/.github/actions/build-external-packages/action.yml @@ -0,0 +1,86 @@ +# .github/workflows/build-external.yml +name: Build External packages + +description: build external packages for PyTorch + +inputs: + cuda-version: + description: CUDA version to use + type: string + required: true + default: '12.8.1' + cuda-arch-list: + description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0") + type: string + required: true + default: "" + docker-image: + description: Base image to use + type: string + required: true + build-targets: + description: Build targets + type: string + required: true + torch-wheel-dir: + description: Directory to built torch wheel + type: string + required: false + default: dist + output-dir: + description: Directory to store build artifact + default: external + type: string + required: false + +outputs: + build_time: + description: "Total build time in seconds" + value: ${{ steps.build-external.outputs.build_time }} + output_dir: + description: "Directory where build artifact is stored" + value: ${{ steps.build-external.outputs.output_dir }} + +runs: + using: composite + steps: + - name: Build external packages in sequence + id: build-external + env: + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + CUDA_VERSION: ${{ inputs.cuda-version }} + TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} + BASE_IMAGE: ${{ inputs.docker-image }} + BUILD_TARGETS: ${{ inputs.build-targets }} + PARENT_OUTPUT_DIR: ${{ inputs.output-dir }} + TORCH_WHEELS_PATH: ${{ inputs.torch-wheel-dir }} + shell: bash + run: | + set -euo pipefail + python3 --version + docker images + START_TIME=$(date +%s) + ( + cd .ci/lumen_cli + python3 -m pip install -e . 
+ ) + MAX_JOBS="$(nproc --ignore=6)" + export MAX_JOBS + + # Split the comma-separated list and build each target + IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS" + for target in "${TARGETS[@]}"; do + OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target" + export OUTPUT_DIR + echo "Building external package: $target in directory $OUTPUT_DIR" + python3 -m cli.run build external "$target" + done + + END_TIME=$(date +%s) + { + echo "build_time=$((END_TIME - START_TIME))" + if [ -d "$PARENT_OUTPUT_DIR" ]; then + echo "output_dir=$PARENT_OUTPUT_DIR" + fi + } >> "$GITHUB_OUTPUT" diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 055404c69474d..15f193ef3a5dc 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -57,6 +57,21 @@ runs: submodules: ${{ inputs.submodules }} show-progress: false + - name: Clean submodules post checkout + id: clean-submodules + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | + cd "${GITHUB_WORKSPACE}" + # Clean stale submodule dirs + if [ -z "${NO_SUDO}" ]; then + sudo git submodule foreach --recursive git clean -ffdx + else + git submodule foreach --recursive git clean -ffdx + fi + - name: Clean workspace (try again) if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index d3644c52fbcd8..a58db801b1cf8 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -59,11 +59,6 @@ runs: echo "$msg" exit 1 fi - if [[ $ngpu -eq 1 ]]; then - echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs" - echo "$msg" - exit 1 - fi - name: Runner diskspace health check uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index 63acd791b85c6..d4b8be8b609a0 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -24,7 +24,6 @@ runs: -e PYTORCH_FINAL_PACKAGE_DIR \ -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ - -e USE_SPLIT_BUILD \ --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 70e9da5216ae2..b0255e764c594 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -9b57c7bd5ad4db093c5bb31c802df9f04d933ac9 +27fc2493d383354a008106f22f3be232badee9a1 diff --git a/.github/ci_commit_pins/fbgemm_rocm.txt b/.github/ci_commit_pins/fbgemm_rocm.txt index db140a31f3fa4..19f5a2b2efa1a 100644 --- a/.github/ci_commit_pins/fbgemm_rocm.txt +++ b/.github/ci_commit_pins/fbgemm_rocm.txt @@ -1 +1 @@ -7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8 +08ae0af1395c8d8471f4025deb6af9aef90b342f diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt deleted file mode 100644 index efbc3ceeb2afe..0000000000000 --- a/.github/ci_commit_pins/torchbench.txt +++ /dev/null @@ -1 +0,0 @@ -e03a63be43e33596f7f0a43b0f530353785e4a59 diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index 21863c19dec73..c9c4265b2f37f 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ 
-1 +1 @@ -6a39ba85fe0f2fff9494b5eccea717c93510c230 +e10fef08838612b4560e9c72e5cb1414a5edfa13 diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index ee8531ae65100..eb335eb9d64d5 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -b6a5b82b9948b610fa4c304d0d869c82b8f17db1 +6c5478ff7c3d50dd1e3047d72ec5909bea474073 diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm new file mode 100644 index 0000000000000..2cee6ed2df19a --- /dev/null +++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm @@ -0,0 +1,427 @@ +# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo +# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing + +ARG CUDA_VERSION=12.8.1 +ARG PYTHON_VERSION=3.12 + +# BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine, +# by default, it uses the torch-nightly-base stage from this docker image +ARG BUILD_BASE_IMAGE=torch-nightly-base + +# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer, +# by default, it uses devel-ubuntu22.04 official image. +ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile +ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py" + + +#################### TORCH NIGHTLY BASE IMAGE #################### +# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519) +RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) 
&& \ + if command -v apt-get >/dev/null; then \ + if [ "$current_gcc_version" -lt 10 ]; then \ + echo "GCC version is $current_gcc_version, installing gcc-10..."; \ + apt-get update \ + && apt-get install -y gcc-10 g++-10 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ + else \ + echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ + fi \ + fi \ + && gcc --version && g++ --version + +# install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv==0.8.4 + +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +#################### TORCH NIGHTLY BASE IMAGE #################### + + +#################### BASE BUILD IMAGE #################### +# A base image for building vLLM with torch nightly or torch wheels +# prepare basic build environment +FROM ${BUILD_BASE_IMAGE} AS base +USER root + +ARG CUDA_VERSION +ARG PYTHON_VERSION + +# TODO (huydhn): Only work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + +# Install some system dependencies and double check python version +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# Install uv for faster pip installs if not existed +RUN --mount=type=cache,target=/root/.cache/uv \ + if ! 
python3 -m uv --version >/dev/null 2>&1; then \ + python3 -m pip install uv==0.8.4; \ + fi +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +WORKDIR /workspace + +# install build and runtime dependencies +COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml + +# install build and runtime dependencies without stable torch version +RUN python3 use_existing_torch.py + +# default mount file as placeholder, this just avoid the mount error +# change to a different vllm folder if this does not exist anymore +ARG TORCH_WHEELS_PATH="./requirements" +ARG PINNED_TORCH_VERSION + +# Install torch, torchaudio and torchvision based on the input +# if TORCH_WHEELS_PATH is default "./requirements", it will pull thethe nightly versions using pip +# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine +RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ + --mount=type=cache,target=/root/.cache/uv \ + if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ + echo "[INFO] Installing torch wheels to build vllm"; \ + torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ + elif [ -n "$PINNED_TORCH_VERSION" ]; then \ + echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \ + uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ + else \ + echo "[INFO] Installing torch nightly with latest one to build vllm"; \ + uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ + fi + +# Install numba 0.61.2 for cuda environment +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system numba==0.61.2 + +# Install common dependencies from vllm common.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/common.txt + +# Must put before installing xformers, so it can install the correct version of xfomrers. +ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a' +ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list} + +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} + +RUN echo ${TORCH_CUDA_ARCH_LIST} +RUN echo ${MAX_JOBS} +RUN pip freeze | grep -E 'ninja' + +# Build xformers with cuda and torch nightly/wheel +# following official xformers guidance: https://github.com/facebookresearch/xformers#build +# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2 +ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468 +ENV CCACHE_DIR=/root/.cache/ccache + +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + echo 'git clone xformers...' \ + && git clone https://github.com/facebookresearch/xformers.git --recursive \ + && cd xformers \ + && git checkout ${XFORMERS_COMMIT} \ + && git submodule update --init --recursive \ + && echo 'finish git clone xformers...' 
\ + && rm -rf build \ + && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \ + && cd .. \ + && rm -rf xformers + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system xformers-dist/*.whl --verbose + +# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. +# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same +RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt + +RUN cat torch_build_versions.txt +RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' + +#################### BASE BUILD IMAGE #################### + + +#################### WHEEL BUILD IMAGE #################### +# Image used to build vllm wheel +FROM base AS build +ARG TARGETPLATFORM + +COPY . . + +RUN python3 use_existing_torch.py + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi + +# Max jobs used by Ninja to build extensions +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} +ARG nvcc_threads=4 +ENV NVCC_THREADS=$nvcc_threads +ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} + +ARG USE_SCCACHE +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 + +# if USE_SCCACHE is set, use sccache to speed up compilation +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." \ + && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + && tar -xzf sccache.tar.gz \ + && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ + && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ + && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \ + && sccache --show-stats; \ + fi + +ARG vllm_target_device="cuda" +ENV VLLM_TARGET_DEVICE=${vllm_target_device} +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ + python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \ + fi + +RUN echo "[INFO] Listing current directory:" && \ + ls -al && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ + cat torch_build_versions.txt + +#################### WHEEL BUILD IMAGE #################### + + +################### VLLM INSTALLED IMAGE #################### +# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer +FROM ${FINAL_BASE_IMAGE} AS vllm-base +USER root + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# TODO (huydhn): Only 
work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + +# prepare for environment starts +WORKDIR /workspace + +# Install Python and other dependencies +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version + +# Get the torch versions, and whls used in previous stagtes for consistency +COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt +COPY --from=base /workspace/xformers-dist /wheels/xformers +COPY --from=build /workspace/vllm-dist /wheels/vllm +RUN echo "[INFO] Listing current directory before torch install step:" && \ + ls -al && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ + cat torch_build_versions.txt + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# Install uv for faster pip installs if not existed +RUN --mount=type=cache,target=/root/.cache/uv \ + if ! python3 -m uv --version > /dev/null 2>&1; then \ + python3 -m pip install uv==0.8.4; \ + fi +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +# Default mount file as placeholder, this just avoid the mount error +ARG TORCH_WHEELS_PATH="./requirements" +# Install torch, torchaudio and torchvision +# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt +# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine +RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ + --mount=type=cache,target=/root/.cache/uv \ + if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ + torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ + echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ + else \ + echo "[INFO] Installing torch versions from torch_build_versions.txt"; \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.'); \ + fi + +# Install the vllm wheel from previous stage +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /wheels/vllm/*.whl --verbose + +# Install xformers wheel from previous stage +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /wheels/xformers/*.whl --verbose + +# Build flashinfer from source. +ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' +# install package for build flashinfer +# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 + +RUN pip install build==1.3.0 +RUN pip freeze | grep -E 'setuptools|packaging|build' + +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} +# Build flashinfer for torch nightly from source around 10 mins +ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" +# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt +ARG FLASHINFER_GIT_REF="v0.2.14.post1" +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone --depth 1 --recursive --shallow-submodules \ + --branch ${FLASHINFER_GIT_REF} \ + ${FLASHINFER_GIT_REPO} flashinfer \ + && echo "Building FlashInfer with AOT for arches: ${torch_cuda_arch_list}" \ + && cd flashinfer \ + && python3 -m flashinfer.aot \ + && python3 -m build --no-isolation --wheel --outdir ../wheels/flashinfer \ + && cd .. \ + && rm -rf flashinfer + +# install flashinfer python +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system wheels/flashinfer/*.whl --verbose + +# Logging to confirm the torch versions +RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' +RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt +################### VLLM INSTALLED IMAGE #################### + + +#################### UNITTEST IMAGE ############################# +FROM vllm-base as test + +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + +COPY tests/ tests/ +COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . 
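Editorial note: the pinned-torch install a few stages above keys off torch_build_versions.txt (pip-freeze style pins recorded at build time) and the cu-suffixed nightly index. As a rough sketch of that logic outside Docker -- assuming the same pin file and index URL scheme; the helper functions are hypothetical -- the install command could be assembled like this:

# Illustrative only -- not part of the Dockerfile. Rebuilds the
# "uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url ..."
# step from the pin file and the CUDA version.
from pathlib import Path


def nightly_index_url(cuda_version: str) -> str:
    """Map e.g. '12.8.1' to the cu128 nightly wheel index used above."""
    major, minor = cuda_version.split(".")[:2]
    return f"https://download.pytorch.org/whl/nightly/cu{major}{minor}"


def pinned_install_command(versions_file: str, cuda_version: str) -> list[str]:
    """Build the uv invocation that installs the exact torch pins from the build stage."""
    pins = [
        line.strip()
        for line in Path(versions_file).read_text().splitlines()
        if line.strip()
    ]
    return ["uv", "pip", "install", "--system", *pins,
            "--index-url", nightly_index_url(cuda_version)]

Recording the build-stage pins in one file and replaying them here is what keeps the torch version in the runtime image identical to the one the wheels were built against.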
+COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml +# Install build and runtime dependencies without stable torch version +COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt + +RUN python3 use_existing_torch.py + +# install packages +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/common.txt +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -e tests/vllm_test_utils + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/nightly_torch_test.txt + +# Logging to confirm the torch versions +RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' + +# Logging to confirm all the packages are installed +RUN pip freeze + +#################### UNITTEST IMAGE ############################# + +#################### EXPORT STAGE #################### +FROM scratch as export-wheels + +# Just copy the wheels we prepared in previous stages +COPY --from=base /workspace/xformers-dist /wheels/xformers +COPY --from=build /workspace/vllm-dist /wheels/vllm +COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt +COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000..944d3fec35659 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,24 @@ +version: 2 +updates: + # Update to the latest transformers version with dependabot + - package-ecosystem: "pip" + directory: "/.ci/docker/ci_commit_pins" + schedule: + interval: "daily" + target-branch: "main" + allow: + - dependency-name: "transformers" + ignore: + - dependency-name: "*" + update-types: ["version-update:semver-patch"] + commit-message: + prefix: "[Dependabot] Update" + include: "scope" + labels: + - "dependencies" + - "open source" + - "python" + - "topic: not user facing" + - "module: ci" + - "module: inductor" + - "ciflow/inductor" diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index a5982b63b70fc..a0aa6921b92ba 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -22,10 +22,12 @@ ciflow_push_tags: - ciflow/rocm - ciflow/rocm-mi300 - ciflow/s390 +- ciflow/riscv64 - ciflow/slow - ciflow/trunk - ciflow/unstable - ciflow/xpu +- ciflow/vllm - ciflow/torchbench - ciflow/op-benchmark - ciflow/pull diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64 deleted file mode 100644 index b6e9a6ce9f3e5..0000000000000 --- a/.github/requirements/conda-env-macOS-ARM64 +++ /dev/null @@ -1,5 +0,0 @@ -# Not pinning certifi so that we can always get the latest certificates -certifi -pip=23.2.1 -pkg-config=0.29.2 -wheel=0.37.1 diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 224835188d87f..3a27cac46f71f 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -28,7 +28,7 @@ pyyaml==6.0.2 scipy==1.12.0 setuptools==72.1.0 sympy==1.13.3 -tlparse==0.3.30 +tlparse==0.4.0 tensorboard==2.13.0 typing-extensions==4.12.2 unittest-xml-reporting<=3.2.0,>=2.0.0 diff --git 
a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 9e86d332c5316..e541e7a86f653 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -119,6 +119,7 @@ def build_triton( ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir ) else: + check_call(["git", "fetch", "origin", commit_hash], cwd=triton_basedir) check_call(["git", "checkout", commit_hash], cwd=triton_basedir) # change built wheel name and version diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index def91d29f2bd2..4a4f8a65f684d 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,17 +16,17 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this -CUDA_ARCHES = ["12.6", "12.8", "12.9"] +CUDA_ARCHES = ["12.6", "12.8", "13.0"] CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", - "12.9": "12.9.1", + "13.0": "13.0.0", } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", - "12.9": "9", + "13.0": "9", } # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this @@ -38,7 +38,7 @@ CPU_S390X_ARCH = ["cpu-s390x"] -CUDA_AARCH64_ARCHES = ["12.9-aarch64"] +CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"] PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { @@ -54,7 +54,7 @@ "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" @@ -71,49 +71,49 @@ "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" ), - "12.9": ( - "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" + "13.0": ( + "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" ), "xpu": ( - "intel-cmplr-lib-rt==2025.1.1 | " - "intel-cmplr-lib-ur==2025.1.1 | " - "intel-cmplr-lic-rt==2025.1.1 | " - "intel-sycl-rt==2025.1.1 | " - "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "onemkl-sycl-blas==2025.1.0 | " - "onemkl-sycl-dft==2025.1.0 | " - "onemkl-sycl-lapack==2025.1.0 | " - "onemkl-sycl-rng==2025.1.0 | " - "onemkl-sycl-sparse==2025.1.0 | " - "dpcpp-cpp-rt==2025.1.1 | " - "intel-opencl-rt==2025.1.1 | " - "mkl==2025.1.0 | " - "intel-openmp==2025.1.1 | " - "tbb==2022.1.0 | " - "tcmlib==1.3.0 | " - "umf==0.10.0 | " - "intel-pti==0.12.3" + "intel-cmplr-lib-rt==2025.2.1 | " + "intel-cmplr-lib-ur==2025.2.1 | " + "intel-cmplr-lic-rt==2025.2.1 | " + "intel-sycl-rt==2025.2.1 | " + "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.16.1; platform_system == 'Linux' and 
platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.2.0 | " + "onemkl-sycl-dft==2025.2.0 | " + "onemkl-sycl-lapack==2025.2.0 | " + "onemkl-sycl-rng==2025.2.0 | " + "onemkl-sycl-sparse==2025.2.0 | " + "dpcpp-cpp-rt==2025.2.1 | " + "intel-opencl-rt==2025.2.1 | " + "mkl==2025.2.0 | " + "intel-openmp==2025.2.1 | " + "tbb==2022.2.0 | " + "tcmlib==1.4.0 | " + "umf==0.11.0 | " + "intel-pti==0.13.1" ), } @@ -124,9 +124,7 @@ def get_nccl_wheel_version(arch_version: str) -> str: requirements = map( str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]) ) - return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[ - 1 - ] + return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1] def read_nccl_pin(arch_version: str) -> str: @@ -193,7 +191,7 @@ def arch_type(arch_version: str) -> str: "cpu": "libtorch-cxx11-builder:cpu", } -FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"] +FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"] def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: @@ -273,7 +271,6 @@ def generate_wheels_matrix( os: str, arches: Optional[list[str]] = None, python_versions: Optional[list[str]] = None, - use_split_build: bool = False, ) -> list[dict[str, str]]: package_type = "wheel" if os == "linux" or os == "linux-aarch64" or os == "linux-s390x": @@ -312,28 +309,20 @@ def generate_wheels_matrix( else arch_version ) - # TODO: Enable python 3.13t on cpu-s390x - if gpu_arch_type == "cpu-s390x" and python_version == "3.13t": - continue - # TODO: Enable python 3.14 on non linux OSes - if os != "linux" and ( - python_version == "3.14" or python_version == "3.14t" - ): + # TODO: Enable python 3.14 for rest + if os not in [ + "linux", + "linux-aarch64", + "linux-s390x", + "macos-arm64", + "windows", + ] and (python_version == "3.14" or python_version == "3.14t"): continue - if use_split_build and ( - arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux" - ): - raise RuntimeError( - "Split build is only supported on linux with cuda 12* and cpu.\n" - f"Currently attempting to build on arch version {arch_version} and os {os}.\n" - "Please modify the matrix generation to exclude this combination." - ) - # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( - arch_version in ["12.9", "12.8", "12.6"] + arch_version in ["13.0", "12.8", "12.6"] and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -344,7 +333,6 @@ def generate_wheels_matrix( "gpu_arch_type": gpu_arch_type, "gpu_arch_version": gpu_arch_version, "desired_cuda": desired_cuda, - "use_split_build": "True" if use_split_build else "False", "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -367,30 +355,6 @@ def generate_wheels_matrix( ), # include special case for aarch64 build, remove the -aarch64 postfix } ) - # Special build building to use on Colab. 
Python 3.11 for 12.6 CUDA - if python_version == "3.11" and arch_version == CUDA_STABLE: - ret.append( - { - "python_version": python_version, - "gpu_arch_type": gpu_arch_type, - "gpu_arch_version": gpu_arch_version, - "desired_cuda": translate_desired_cuda( - gpu_arch_type, gpu_arch_version - ), - "use_split_build": "True" if use_split_build else "False", - "container_image": WHEEL_CONTAINER_IMAGES[ - arch_version - ].split(":")[0], - "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ - arch_version - ].split(":")[1], - "package_type": package_type, - "pytorch_extra_install_requirements": "", - "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950 - ".", "_" - ), - } - ) else: ret.append( { @@ -400,7 +364,6 @@ def generate_wheels_matrix( "desired_cuda": translate_desired_cuda( gpu_arch_type, gpu_arch_version ), - "use_split_build": "True" if use_split_build else "False", "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( ":" )[0], @@ -422,6 +385,6 @@ def generate_wheels_matrix( return ret -validate_nccl_dep_consistency("12.9") +validate_nccl_dep_consistency("13.0") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 4df6150f97655..67906d4ad88d5 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -59,9 +59,7 @@ class BinaryBuildWorkflow: is_scheduled: str = "" branches: str = "nightly" # Mainly for macos - cross_compile_arm64: bool = False macos_runner: str = "macos-14-xlarge" - use_split_build: bool = False # Mainly used for libtorch builds build_variant: str = "" @@ -72,9 +70,6 @@ def __post_init__(self) -> None: for item in [self.os, "binary", self.package_type, self.build_variant] if item != "" ) - if self.use_split_build: - # added to distinguish concurrency groups - self.build_environment += "-split" def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file_path = ( @@ -117,21 +112,6 @@ class OperatingSystem: isolated_workflow=True, ), ), - # See https://github.com/pytorch/pytorch/issues/138750 - # BinaryBuildWorkflow( - # os=OperatingSystem.LINUX, - # package_type="manywheel", - # build_configs=generate_binary_build_matrix.generate_wheels_matrix( - # OperatingSystem.LINUX, - # use_split_build=True, - # arches=["11.8", "12.1", "12.4", "cpu"], - # ), - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, - # isolated_workflow=True, - # ), - # use_split_build=True, - # ), BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", @@ -175,27 +155,11 @@ class OperatingSystem: package_type="manywheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, - arches=["12.6", "12.8", "12.9"], - python_versions=["3.9"], + arches=["12.8"], + python_versions=["3.12"], ), branches="main", ), - # See https://github.com/pytorch/pytorch/issues/138750 - # BinaryBuildWorkflow( - # os=OperatingSystem.LINUX, - # package_type="manywheel", - # build_configs=generate_binary_build_matrix.generate_wheels_matrix( - # OperatingSystem.LINUX, - # arches=["11.8", "12.1", "12.4"], - # python_versions=["3.9"], - # use_split_build=True, - # ), - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_PERIODIC}, - # ), - # branches="main", - # use_split_build=True, - # ), BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", @@ 
-338,7 +302,6 @@ class OperatingSystem: generate_binary_build_matrix.RELEASE, libtorch_variants=["shared-with-deps"], ), - cross_compile_arm64=False, macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, @@ -351,7 +314,6 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.MACOS_ARM64 ), - cross_compile_arm64=False, macos_runner="macos-14-xlarge", ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, diff --git a/.github/scripts/gql_mocks.json.gz b/.github/scripts/gql_mocks.json.gz index 07628227a18a8..67355239dc422 100644 Binary files a/.github/scripts/gql_mocks.json.gz and b/.github/scripts/gql_mocks.json.gz differ diff --git a/.github/scripts/runner_determinator.py b/.github/scripts/runner_determinator.py index 1481459d40c4c..baf560234549b 100644 --- a/.github/scripts/runner_determinator.py +++ b/.github/scripts/runner_determinator.py @@ -262,7 +262,12 @@ def is_exception_branch(branch: str) -> bool: """ Branches that get opted out of experiments by default, until they're explicitly enabled. """ - return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"} + return branch.split("/", maxsplit=1)[0] in { + "main", + "nightly", + "release", + "landchecks", + } def load_yaml(yaml_text: str) -> Any: diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index e4a8cb2bc8df1..ac3a1cc12921c 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -27,6 +27,7 @@ get_drci_classifications, gh_get_team_members, GitHubPR, + iter_issue_timeline_until_comment, JobCheckState, main as trymerge_main, MandatoryChecksMissingError, @@ -34,6 +35,8 @@ RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, + sha_from_committed_event, + sha_from_force_push_after, validate_revert, ) @@ -70,6 +73,9 @@ def save_mocked_queries(obj: Any) -> None: if key in mocked_queries: return mocked_queries[key] + # TODO: Remove me once https://github.com/pytorch/pytorch/issues/160489 is resolved + raise ValueError(f"Key {key} could not be found in gql_mocks") + try: rc = fallback_function(*args) except HTTPError as err: @@ -121,7 +127,7 @@ def __init__(self) -> None: self.force = force self.pr_num = 76123 self.dry_run = True - self.comment_id = 0 + self.comment_id = 12345 # Set to non-zero value self.reason = "this is for testing" self.ignore_current = False self.check_mergeability = False @@ -149,9 +155,9 @@ def mock_revert( def mock_merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, @@ -467,9 +473,9 @@ def test_main_force( mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=True, - comment_id=mock.ANY, ignore_current=False, ) @@ -482,9 +488,9 @@ def test_main_merge(self, mock_merge: Any, *args: Any) -> None: mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=False, - comment_id=mock.ANY, ignore_current=False, ) @@ -1135,5 +1141,176 @@ def test__revlist_to_prs_two_prs( ) +@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) +@mock.patch("trymerge.gh_fetch_merge_base", return_value="") +@mock.patch( + "trymerge.get_drci_classifications", 
side_effect=mocked_drci_classifications
+)
+class TestTimelineFunctions(TestCase):
+    """Tests for the new timeline-related functions"""
+
+    def test_sha_from_committed_event(self, *args: Any) -> None:
+        """Test extracting SHA from committed event"""
+        # Based on actual GitHub API format - committed events have "sha" at top level
+        event = {
+            "event": "committed",
+            "sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
+        }
+        self.assertEqual(
+            sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
+        )
+
+        # Test with missing SHA
+        event_no_sha = {"event": "committed"}
+        self.assertIsNone(sha_from_committed_event(event_no_sha))
+
+    def test_sha_from_force_push_after(self, *args: Any) -> None:
+        """Test extracting SHA from force push event"""
+        # NOTE: sha_from_force_push_after handles both the current GitHub API format,
+        # where force push events carry "commit_id" at the top level, and the legacy
+        # "after", "after_commit", "after_sha", and "head_sha" fields
+
+        # Test with the legacy format
+        event_legacy = {
+            "event": "head_ref_force_pushed",
+            "after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
+        }
+        self.assertEqual(
+            sha_from_force_push_after(event_legacy),
+            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
+        )
+
+        # Test with the current GitHub API format (top-level "commit_id")
+        event_real_api = {
+            "event": "head_ref_force_pushed",
+            "commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
+        }
+        self.assertEqual(
+            sha_from_force_push_after(event_real_api),
+            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
+        )  # commit_id takes precedence over the legacy fields
+
+        # Test with missing SHA
+        event_no_sha = {"event": "head_ref_force_pushed"}
+        self.assertIsNone(sha_from_force_push_after(event_no_sha))
+
+    @mock.patch("trymerge.gh_fetch_json_list")
+    def test_iter_issue_timeline_until_comment(
+        self, mock_gh_fetch_json_list: Any, *args: Any
+    ) -> None:
+        """Test timeline iteration until target comment"""
+        # Mock timeline data based on actual GitHub API format
+        timeline_data = [
+            {"event": "commented", "id": 100, "body": "first comment"},
+            {"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
+            {"event": "commented", "id": 200, "body": "target comment"},
+            {"event": "commented", "id": 300, "body": "after target"},
+        ]
+        mock_gh_fetch_json_list.return_value = timeline_data
+
+        # Test iteration stops at target comment
+        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
+        self.assertEqual(len(events), 3)  # Should stop at target comment
+        self.assertEqual(events[0]["event"], "commented")
+        self.assertEqual(events[0]["id"], 100)
+        self.assertEqual(events[1]["event"], "committed")
+        self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
+        self.assertEqual(events[2]["event"], "commented")
+        self.assertEqual(events[2]["id"], 200)
+
+    @mock.patch("trymerge.gh_fetch_json_list")
+    def test_iter_issue_timeline_until_comment_not_found(
+        self, mock_gh_fetch_json_list: Any, *args: Any
+    ) -> None:
+        """Test timeline iteration when target comment is not found"""
+        # Mock empty timeline
+        mock_gh_fetch_json_list.return_value = []
+
+        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
+        self.assertEqual(len(events), 0)
+
+    @mock.patch("trymerge.iter_issue_timeline_until_comment")
+    def test_get_commit_sha_at_comment_commit_after_comment(
+        self, mock_iter_timeline: Any, *args: Any
+    ) -> None:
+        """Test
get_commit_sha_at_comment returns correct SHA after comment""" + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 100}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit2") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "commit_id": "commit3"}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_multiple_comments( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "commented", "id": 100}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 200}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 300}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(200) + self.assertEqual(sha, "commit2") + sha = pr.get_commit_sha_at_comment(300) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_no_events( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "commented", "id": 100}, + {"event": "labeled", "label": {"name": "test"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_exception( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.side_effect = Exception("API error") + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + if __name__ == "__main__": main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 9db85ee00ebea..00b66869dcf2a 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -108,10 +108,6 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): fragment PRCheckSuites on CheckSuiteConnection { edges { node { - app { - name - databaseId - } workflowRun { workflow { name @@ -454,6 +450,63 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 +def iter_issue_timeline_until_comment( + 
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
+) -> Any:
+    """
+    Yield timeline entries in order until (and including) the entry whose id == target_comment_id
+    for a 'commented' event. Stops once the target comment is encountered.
+    """
+    page = 1
+
+    while page <= max_pages:
+        url = (
+            f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
+        )
+        params = {"per_page": 100, "page": page}
+
+        batch = gh_fetch_json_list(url, params)
+
+        if not batch:
+            return
+        for ev in batch:
+            # The target is the issue comment row with event == "commented" and id == target_comment_id
+            if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
+                yield ev  # nothing in the timeline after this matters, so stop early
+                return
+            yield ev
+        if len(batch) < 100:
+            return
+        page += 1
+
+    # If we got here without finding the comment, then we either hit a bug or some github PR
+    # has a _really_ long timeline.
+    # The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
+    raise RuntimeError(
+        f"Could not find comment {target_comment_id} in the first {max_pages} pages of the timeline at url {url}. "
+        f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
+    )
+
+
+def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
+    """Extract SHA from committed event in timeline"""
+    return ev.get("sha")
+
+
+def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
+    """Extract SHA from force push event in timeline"""
+    # The current GitHub API format
+    commit_id = ev.get("commit_id")
+    if commit_id:
+        return str(commit_id)
+
+    # Legacy format
+    after = ev.get("after") or ev.get("after_commit") or {}
+    if isinstance(after, dict):
+        return after.get("sha") or after.get("oid")
+    return ev.get("after_sha") or ev.get("head_sha")
+
+
 def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
     rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
     return rc["data"]["repository"]["pullRequest"]
@@ -741,16 +794,24 @@ def get_changed_files_count(self) -> int:
     def last_commit(self) -> Any:
         return self.info["commits"]["nodes"][-1]["commit"]
 
+    def last_commit_sha(self, default: Optional[str] = None) -> str:
+        # for commits, the oid is the sha
+
+        if default is None:
+            return str(self.last_commit()["oid"])
+
+        return str(self.last_commit().get("oid", default))
+
     def get_merge_base(self) -> str:
         if self.merge_base:
             return self.merge_base
 
-        last_commit_oid = self.last_commit()["oid"]
+        last_commit_sha = self.last_commit_sha()
         # NB: We could use self.base_ref() here for regular PR, however, that doesn't
         # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
         # so let's just use main instead
         self.merge_base = gh_fetch_merge_base(
-            self.org, self.project, last_commit_oid, self.default_branch()
+            self.org, self.project, last_commit_sha, self.default_branch()
         )
 
         # Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@@ -839,6 +900,44 @@ def get_approved_by(self) -> list[str]:
     def get_commit_count(self) -> int:
         return int(self.info["commits_with_authors"]["totalCount"])
 
+    def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
+        """
+        Get the PR head commit SHA that was present when a specific comment was posted.
+        This ensures we only merge the state of the PR at the time the merge command was issued,
+        not any subsequent commits that may have been pushed after.
+
+        Returns None if no head-changing events are found before the comment or if the comment is not found.
+        """
+        head = None
+
+        try:
+            for event in iter_issue_timeline_until_comment(
+                self.org, self.project, self.pr_num, comment_id
+            ):
+                etype = event.get("event")
+                if etype == "committed":
+                    sha = sha_from_committed_event(event)
+                    if sha:
+                        head = sha
+                        print(f"Timeline: Found commit event for SHA {sha}")
+                elif etype == "head_ref_force_pushed":
+                    sha = sha_from_force_push_after(event)
+                    if sha:
+                        head = sha
+                        print(f"Timeline: Found force push event for SHA {sha}")
+                elif etype == "commented":
+                    if event.get("id") == comment_id:
+                        print(f"Timeline: Found target comment {comment_id}; head SHA at that point is {head}")
+                        return head
+        except Exception as e:
+            print(
+                f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
+            )
+            return None
+
+        print(f"Did not find comment with id {comment_id} in the PR timeline")
+        return None
+
     def get_pr_creator_login(self) -> str:
         return cast(str, self.info["author"]["login"])
 
@@ -1155,7 +1254,7 @@ def merge_into(
         *,
         skip_mandatory_checks: bool = False,
         dry_run: bool = False,
-        comment_id: Optional[int] = None,
+        comment_id: int,
         ignore_current_checks: Optional[list[str]] = None,
     ) -> None:
         # Raises exception if matching rule is not found
@@ -1171,7 +1270,7 @@ def merge_into(
             skip_internal_checks=can_skip_internal_checks(self, comment_id),
             ignore_current_checks=ignore_current_checks,
         )
-        additional_merged_prs = self.merge_changes(
+        additional_merged_prs = self.merge_changes_locally(
             repo, skip_mandatory_checks, comment_id
         )
 
@@ -1200,7 +1299,7 @@ def merge_into(
             broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
             flaky_checks=ignorable_checks.get("FLAKY", []),
             unstable_checks=ignorable_checks.get("UNSTABLE", []),
-            last_commit_sha=self.last_commit().get("oid", ""),
+            last_commit_sha=self.last_commit_sha(default=""),
             merge_base_sha=self.get_merge_base(),
             merge_commit_sha=merge_commit_sha,
             is_failed=False,
@@ -1221,7 +1320,7 @@ def merge_into(
             dry_run=dry_run,
         )
 
-    def merge_changes(
+    def merge_changes_locally(
         self,
         repo: GitRepo,
         skip_mandatory_checks: bool = False,
@@ -1230,27 +1329,15 @@ def merge_changes(
         skip_all_rule_checks: bool = False,
     ) -> list["GitHubPR"]:
         """
-        :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
+        :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
         """
         branch_to_merge_into = self.default_branch() if branch is None else branch
         if repo.current_branch() != branch_to_merge_into:
            repo.checkout(branch_to_merge_into)
-        if not self.is_ghstack_pr():
-            msg = self.gen_commit_message()
-            pr_branch_name = f"__pull-request-{self.pr_num}__init__"
-            repo.fetch(self.last_commit()["oid"], pr_branch_name)
-            repo._run_git("merge", "--squash", pr_branch_name)
-            repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
-
-            # Did the PR change since we started the merge?
-            pulled_sha = repo.show_ref(pr_branch_name)
-            latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
-            if pulled_sha != latest_pr_status.last_commit()["oid"]:
-                raise RuntimeError(
-                    "PR has been updated since CI checks last passed. Please rerun the merge command."
-                )
-            return []
-        else:
+
+        # It's okay to skip the commit SHA check for ghstack PRs since
+        # authoring requires write access to the repo.
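+        # For regular (non-ghstack) PRs, the code below merges the exact commit that
+        # was HEAD when the merge comment was posted, as reconstructed from the issue
+        # timeline, and bails out if the PR has been updated since then.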
+ if self.is_ghstack_pr(): return self.merge_ghstack_into( repo, skip_mandatory_checks, @@ -1258,6 +1345,48 @@ def merge_changes( skip_all_rule_checks=skip_all_rule_checks, ) + msg = self.gen_commit_message() + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + + # Determine which commit SHA to merge + commit_to_merge = None + if not comment_id: + raise ValueError("Must provide --comment-id when merging regular PRs") + + # Get the commit SHA that was present when the comment was made + commit_to_merge = self.get_commit_sha_at_comment(comment_id) + if not commit_to_merge: + raise RuntimeError( + f"Could not find commit that was pushed before comment {comment_id}" + ) + + # Validate that this commit is the latest commit on the PR + latest_commit = self.last_commit_sha() + if commit_to_merge != latest_commit: + raise RuntimeError( + f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted " + f"but now the latest commit on the PR is {latest_commit}. " + f"Please re-issue the merge command to merge the latest commit." + ) + + print(f"Merging commit {commit_to_merge} locally") + + repo.fetch(commit_to_merge, pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) + repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) + + # Did the PR change since we started the merge? + pulled_sha = repo.show_ref(pr_branch_name) + latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) + if ( + pulled_sha != latest_pr_status.last_commit_sha() + or pulled_sha != commit_to_merge + ): + raise RuntimeError( + "PR has been updated since CI checks last passed. Please rerun the merge command." + ) + return [] + class MergeRuleFailedError(RuntimeError): def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None: @@ -1462,7 +1591,7 @@ def find_matching_merge_rule( pending_checks = [] failed_checks = [] - hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}" if len(failed_checks) > 0: if reject_reason_score < 30000: reject_reason_score = 30000 @@ -2160,14 +2289,14 @@ def categorize_checks( def merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, ) -> None: - initial_commit_sha = pr.last_commit()["oid"] + initial_commit_sha = pr.last_commit_sha() pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}" print(f"Attempting merge of {initial_commit_sha} ({pr_link})") @@ -2238,7 +2367,7 @@ def merge( f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)" ) pr = GitHubPR(pr.org, pr.project, pr.pr_num) - if initial_commit_sha != pr.last_commit()["oid"]: + if initial_commit_sha != pr.last_commit_sha(): raise RuntimeError( "New commits were pushed while merging. Please rerun the merge command." 
) @@ -2405,7 +2534,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: if args.check_mergeability: if pr.is_ghstack_pr(): get_ghstack_prs(repo, pr) # raises error if out of sync - pr.merge_changes( + pr.merge_changes_locally( repo, skip_mandatory_checks=True, skip_all_rule_checks=True, @@ -2420,12 +2549,18 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run) return try: + # Ensure comment id is set, else fail + if not args.comment_id: + raise ValueError( + "Comment ID is required for merging PRs, please provide it using --comment-id" + ) + merge( pr, repo, + comment_id=args.comment_id, dry_run=args.dry_run, skip_mandatory_checks=args.force, - comment_id=args.comment_id, ignore_current=args.ignore_current, ) except Exception as e: @@ -2447,7 +2582,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: broken_trunk_checks=[], flaky_checks=[], unstable_checks=[], - last_commit_sha=pr.last_commit().get("oid", ""), + last_commit_sha=pr.last_commit_sha(default=""), merge_base_sha=pr.get_merge_base(), is_failed=True, skip_mandatory_checks=args.force, diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index 0f11fe34068eb..75c916ecdbef7 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -35,6 +35,9 @@ cd magma mkdir build && cd build set GPU_TARGET=All +if "%CUVER_NODOT%" == "130" ( + set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +) if "%CUVER_NODOT%" == "129" ( set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) diff --git a/.github/scripts/windows/build_triton.bat b/.github/scripts/windows/build_triton.bat index 97cd535a49889..d26dc8bf3b198 100644 --- a/.github/scripts/windows/build_triton.bat +++ b/.github/scripts/windows/build_triton.bat @@ -1,18 +1,12 @@ @echo on -set PYTHON_PREFIX=%PY_VERS:.=% -set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py% -call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat -:: Create a new conda environment -if "%PY_VERS%" == "3.13t" ( - call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13 -) else ( - call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS% -) +set DESIRED_PYTHON=%PY_VERS% +call .ci/pytorch/windows/internal/install_python.bat + :: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 -call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja +%PYTHON_EXEC% -m pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4 dir "%VC_INSTALL_PATH%" call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64 -call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% +%PYTHON_EXEC% .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE% diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 23d4c003efa86..064eea7592230 100644 --- a/.github/templates/common.yml.j2 +++ 
b/.github/templates/common.yml.j2 @@ -4,7 +4,7 @@ {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%} {%- set timeout_minutes = 240 -%} -{%- set timeout_minutes_windows_binary = 300 -%} +{%- set timeout_minutes_windows_binary = 360 -%} {%- macro concurrency(build_environment) -%} concurrency: diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index b14a13f3f90c2..fee9ca2eac120 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -114,12 +114,12 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu - {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %} + {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner {%- elif config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge @@ -135,7 +135,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 29b92ad461ef4..f4b2a66d2acda 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -47,9 +47,6 @@ env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SKIP_ALL_TESTS: 0 -{%- if cross_compile_arm64 %} - CROSS_COMPILE_ARM64: 1 -{% endif %} !{{ common.concurrency(build_environment) }} jobs: @@ -71,11 +68,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi !{{ common.checkout(deep_clone=False, directory="pytorch") }} - name: Populate binary env run: | @@ -113,12 +105,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + 
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index f159d623f1bf7..5e3798f8e2377 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -15,7 +15,7 @@ # favor of GPU_ARCH_VERSION DESIRED_CUDA: !{{ config["desired_cuda"] }} {%- if config["gpu_arch_version"] %} - GPU_ARCH_VERSION: !{{ config["gpu_arch_version"] }} + GPU_ARCH_VERSION: "!{{ config["gpu_arch_version"] }}" {%- endif %} GPU_ARCH_TYPE: !{{ config["gpu_arch_type"] }} {%- if include_skip_tests %} @@ -25,11 +25,6 @@ DOCKER_IMAGE: !{{ config["container_image"] }} DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }} {%- endif %} -{%- if config["package_type"] == "manywheel" %} - {%- if config.use_split_build is defined %} - use_split_build: !{{ config["use_split_build"] }} - {%- endif %} -{%- endif %} {%- if config["package_type"] == "libtorch" %} {%- if config["libtorch_config"] %} LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} @@ -38,7 +33,7 @@ {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" {%- endif %} {%- else %} diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index f11ee4a6621e1..bfa035bc753b8 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -26,13 +26,6 @@ on: default: 240 type: number description: timeout for the job - use_split_build: - description: | - [Experimental] Build a libtorch only wheel and build pytorch such that - are built from the libtorch wheel. - required: false - type: boolean - default: false ALPINE_IMAGE: required: false type: string @@ -117,7 +110,6 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} PYTORCH_FINAL_PACKAGE_DIR: /artifacts SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - USE_SPLIT_BUILD: ${{ inputs.use_split_build }} steps: - name: Make the env permanent during this workflow (but not the secrets) shell: bash @@ -142,7 +134,6 @@ jobs: echo "PR_NUMBER=${{ env.PR_NUMBER }}" echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" echo "SHA1=${{ env.SHA1 }}" - echo "USE_SPLIT_BUILD=${{ env.use_split_build }}" } >> "${GITHUB_ENV} }}" - name: List the env @@ -261,7 +252,6 @@ jobs: -e PYTORCH_ROOT \ -e SKIP_ALL_TESTS \ -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \ - -e USE_SPLIT_BUILD \ --tty \ --detach \ -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 434167d0f0c6d..2d9e4d0e27b25 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -64,13 +64,6 @@ on: required: true type: string description: Hardware to run this job on. 
Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
-      use_split_build:
-        description: |
-          [Experimental] Build a libtorch only wheel and build pytorch such that
-          are built from the libtorch wheel.
-        required: false
-        type: boolean
-        default: false
     secrets:
       github-token:
         required: true
@@ -104,7 +97,6 @@ jobs:
       PR_NUMBER: ${{ github.event.pull_request.number }}
       PYTORCH_FINAL_PACKAGE_DIR: /artifacts
       SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
     steps:
       - name: Make the env permanent during this workflow (but not the secrets)
        shell: bash
@@ -129,7 +121,6 @@ jobs:
            echo "PR_NUMBER=${{ env.PR_NUMBER }}"
            echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
            echo "SHA1=${{ env.SHA1 }}"
-            echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
          } >> "${GITHUB_ENV} }}"

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
@@ -196,6 +187,8 @@ jobs:
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        with:
+          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml
index 6750102b5a293..636b76d42931a 100644
--- a/.github/workflows/_binary-upload.yml
+++ b/.github/workflows/_binary-upload.yml
@@ -51,13 +51,6 @@ on:
        required: false
        type: string
        description: Desired python version
-      use_split_build:
-        description: |
-          [Experimental] Build a libtorch only wheel and build pytorch such that
-          are built from the libtorch wheel.
-        required: false
-        type: boolean
-        default: false
     secrets:
       github-token:
         required: true
@@ -86,7 +79,6 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
      PYTORCH_FINAL_PACKAGE_DIR: /artifacts
      SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
    steps:
    - name: Checkout PyTorch
      uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml
index efe92ca627bba..014e6106b0730 100644
--- a/.github/workflows/_link_check.yml
+++ b/.github/workflows/_link_check.yml
@@ -13,6 +13,7 @@ jobs:
    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
+      job-name: lint-urls
      timeout: 120
      runner: ${{ inputs.runner }}linux.2xlarge
      docker-image: ci-image:pytorch-linux-jammy-linter
@@ -38,6 +39,7 @@ jobs:
    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }}
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
+      job-name: lint-xrefs
      timeout: 60
      runner: ${{ inputs.runner }}linux.2xlarge
      docker-image: ci-image:pytorch-linux-jammy-linter
diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml
index 5173425009f69..6b4bd429e3c9f 100644
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@@ -96,6 +96,13 @@ on:
        required: false
        type: string
        default: ""
+      build-external-packages:
+        description: |
+          If set, build the listed external packages and save their wheels as build artifacts.
+          Use a comma-separated list of package names, e.g. 'vllm,transformers'.
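+          Leave empty (the default) to skip building external packages.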
+ required: false + type: string + default: "" secrets: HUGGING_FACE_HUB_TOKEN: @@ -121,7 +128,7 @@ jobs: # Don't run on forked repos if: github.repository_owner == 'pytorch' runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }} - timeout-minutes: 240 + timeout-minutes: 480 outputs: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} test-matrix: ${{ steps.filter.outputs.test-matrix }} @@ -262,6 +269,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }} + RUNNER: ${{ inputs.runner }} run: | START_TIME=$(date +%s) if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then @@ -287,10 +295,36 @@ jobs: # comes from https://github.com/pytorch/test-infra/pull/6058 TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + if [[ ${BUILD_ENVIRONMENT} == *"riscv64"* ]]; then + # EC2 specific setup for RISC-V emulation + # Ensure binfmt_misc is available + echo "Mounting binfmt_misc filesystem" + sudo mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc 2>/dev/null || true + + echo "QEMU registration: multiarch/qemu-user-static" + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes || true + + # Final verification + echo "Checking binfmt_misc status:" + ls -la /proc/sys/fs/binfmt_misc/ 2>/dev/null || echo "Cannot access binfmt_misc directory" + + if [ -f /proc/sys/fs/binfmt_misc/qemu-riscv64 ]; then + echo "qemu-riscv64 registration successful" + else + echo "qemu-riscv64 registration failed - proceeding without emulation" + echo "This may cause RISC-V builds to fail" + fi + + RISCV_DOCKER_ARGS="--privileged" + else + RISCV_DOCKER_ARGS= + fi + # detached container should get cleaned up by teardown_ec2_linux # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty # shellcheck disable=SC2086 container_name=$(docker run \ + ${RISCV_DOCKER_ARGS} \ -e BUILD_ENVIRONMENT \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e PR_NUMBER \ @@ -306,8 +340,8 @@ jobs: -e OUR_GITHUB_JOB_ID \ -e HUGGING_FACE_HUB_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ - -e USE_SPLIT_BUILD \ -e BUILD_ADDITIONAL_PACKAGES \ + -e RUNNER \ --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ @@ -331,6 +365,26 @@ jobs: END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" + - name: Build external packages + id: build-external-packages + if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped' + uses: ./.github/actions/build-external-packages + with: + build-targets: ${{ inputs.build-external-packages }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + cuda-arch-list: ${{ inputs.cuda-arch-list }} + output-dir: external + + - name: Move external packages to dist + if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped' + shell: bash + run: | + src="${{ steps.build-external-packages.outputs.output_dir }}" + if [ -d "$src" ]; then + mkdir -p "dist/$(dirname "$src")" + mv "$src" "dist/$(dirname "$src")/" + fi + - name: Stop monitoring script if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} shell: bash diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 07be3720b2bf2..66579b573a63d 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml 
@@ -72,6 +72,10 @@ on: required: false description: | HF Auth token to avoid rate limits when downloading models or datasets from hub + VLLM_TEST_HUGGING_FACE_TOKEN: + required: false + description: | + HF Auth token to test vllm SCRIBE_GRAPHQL_ACCESS_TOKEN: required: false description: | @@ -286,6 +290,7 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} DASHBOARD_TAG: ${{ inputs.dashboard-tag }} + VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} @@ -362,6 +367,7 @@ jobs: -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e SKIP_SCCACHE_INITIALIZATION=1 \ -e HUGGING_FACE_HUB_TOKEN \ + -e VLLM_TEST_HUGGING_FACE_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e DASHBOARD_TAG \ -e ARTIFACTS_FILE_SUFFIX \ @@ -403,7 +409,7 @@ jobs: job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} - name: Authenticate with AWS - if: ${{ contains(matrix.runner, 'b200') }} + if: ${{ always() && contains(matrix.runner, 'b200') }} uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 063c97e449c75..086e25b4868eb 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -136,7 +136,7 @@ jobs: MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} run: | - "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7 + "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7 "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 2d660d98905e9..f73972942b5f9 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -88,6 +88,16 @@ jobs: - name: Setup ROCm uses: ./.github/actions/setup-rocm + - name: Runner check GPU count (distributed jobs) + if: ${{ contains(matrix.config, 'distributed') }} + shell: bash + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ $ngpu -lt 4 ]]; then + echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs" + exit 1 + fi + - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index ebfb4001e4379..7067d79eb0758 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -77,6 +77,7 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock # the directory on Windows and prevent GHA from checking out as reported diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 0c95503928fb9..5049ef61f6930 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -70,6 +70,7 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 177e6ca4bbe3c..7aa7608924487 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -275,7 +275,7 @@ jobs: - name: Change permissions if: ${{ always() && steps.test.conclusion }} run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test" - name: Print remaining test logs shell: bash diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index aaf85d7fc8067..0754b154a358d 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -36,7 +36,7 @@ jobs: runs-on: linux.9xlarge.ephemeral strategy: matrix: - tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"] + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"] steps: - name: Build docker image uses: pytorch/pytorch/.github/actions/binary-docker-build@main diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index b2d50efd7d96c..cc2f54fc45f84 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -48,6 +48,7 @@ jobs: fail-fast: false matrix: include: [ + { tag: "cuda13.0" }, { tag: "cuda12.9" }, { tag: "cuda12.8" }, { tag: "cuda12.6" }, diff --git a/.github/workflows/build-magma-linux.yml b/.github/workflows/build-magma-linux.yml index e13de48b2408a..be8f613169e8c 100644 --- a/.github/workflows/build-magma-linux.yml +++ b/.github/workflows/build-magma-linux.yml @@ -34,7 +34,7 @@ jobs: id-token: write strategy: matrix: - cuda_version: ["129", "128", "126"] + cuda_version: ["130", "129", "128", "126"] steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-magma-windows.yml b/.github/workflows/build-magma-windows.yml index 80d870f419e42..b7d293a5cec11 100644 --- a/.github/workflows/build-magma-windows.yml +++ b/.github/workflows/build-magma-windows.yml @@ -22,7 +22,7 @@ jobs: runs-on: windows-2022 strategy: matrix: - cuda_version: ["129", "128", "126"] + cuda_version: ["130", "129", "128", "126"] config: ["Release", "Debug"] env: CUDA_VERSION: ${{ matrix.cuda_version }} diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index e84b84f6158ba..9d08501f51bc5 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -46,11 +46,12 @@ jobs: fail-fast: false matrix: include: [ - { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: 
"linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, - { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index d54f459d0b43e..932d9c8863027 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -145,7 +145,7 @@ jobs: fi docker exec -t "${container_name}" yum install -y zlib-devel zip - docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==3.0.1 auditwheel wheel set +e docker exec -t "${container_name}" command -v pip has_pip=$? @@ -194,7 +194,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] + py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["xpu"] timeout-minutes: 40 env: diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml new file mode 100644 index 0000000000000..658e02ede6fbd --- /dev/null +++ b/.github/workflows/build-vllm-wheel.yml @@ -0,0 +1,248 @@ +name: Build vLLM wheels + +on: + push: + branches: + - main + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + workflow_dispatch: + pull_request: + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-wheel: + if: github.repository_owner == 'pytorch' + strategy: + fail-fast: false + matrix: + python-version: [ '3.12' ] + # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554 + device: [ 'cu128', 'cu129' ] + runner: [ 'linux.12xlarge.memory' ] + include: + - device: cu128 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8' + - device: cu129 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' + name: "Build ${{ matrix.device }} vLLM wheel" + runs-on: ${{ matrix.runner }} + timeout-minutes: 480 + env: + PY_VERS: ${{ matrix.python-version }} + MANYLINUX_IMAGE: ${{ matrix.manylinux-image }} + PLATFORM: 'manylinux_2_28_x86_64' + BUILD_DEVICE: ${{ matrix.device }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Get latest PyTorch nightly + shell: bash + run: | + set -eux + + # Keep PyTorch nightly wheel here so that we 
can install it later during + # vLLM build process + mkdir -p "${RUNNER_TEMP}/artifacts/" + + container_name=$(docker run \ + --tty \ + --detach \ + -e PLATFORM \ + -v "${GITHUB_WORKSPACE}:/pytorch" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w /artifacts/ \ + "${MANYLINUX_IMAGE}" + ) + + # Determine python executable for given version (copied from build-triton-wheel) + case $PY_VERS in + 3.10) + PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python + ;; + 3.11) + PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python + ;; + 3.12) + PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python + ;; + 3.13) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python + ;; + 3.13t) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python + ;; + 3.14) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python + ;; + 3.14t) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python + ;; + *) + echo "Unsupported python version ${PY_VERS}" + exit 1 + ;; + esac + + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # I wonder if there is a command to both download and install the wheels + # in one go + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # Save this for later + echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV" + echo "container_name=${container_name}" >> "$GITHUB_ENV" + + - name: Build vLLM wheel + uses: ./.github/actions/build-external-packages + with: + build-targets: vllm + docker-image: ${{ env.MANYLINUX_IMAGE }} + cuda-arch-list: '8.0;8.9;9.0;10.0;12.0' + torch-wheel-dir: ${{ runner.temp }}/artifacts + output-dir: ${{ runner.temp }}/artifacts/externals + + - name: Prepare vLLM wheel + shell: bash + run: | + set -eux + + # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh + docker exec -t "${container_name}" bash -c " + set -eux + + nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4) + + pushd externals/vllm/wheels + for package in xformers flashinfer-python vllm; do + pushd \$package + auditwheel repair --plat \$PLATFORM *.whl \ + --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* + repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*) + repair_wheel=\$(basename \${repair_wheel}) + popd + + cp \${package}/wheelhouse/\${repair_wheel} . + version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) + + if [[ \$package == vllm ]]; then + new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly} + else + major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' 
-f1-3) + new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly} + fi + + mv -- \$repair_wheel \$new_wheel + rm -rf \$package + done + popd + " + + docker exec -t "${container_name}" chown -R 1000:1000 /artifacts + + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }} + if-no-files-found: error + path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + # Copied from build-triton-wheel workflow (mostly) + upload-wheel: + name: "Upload ${{ matrix.device }} vLLM wheel" + needs: + - build-wheel + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + device: [ 'cu128', 'cu129' ] + env: + BUILD_DEVICE: ${{ matrix.device }} + permissions: + id-token: write + contents: read + container: + image: continuumio/miniconda3:4.12.0 + environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + + - name: Download Build Artifacts + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Wheel Artifacts + shell: bash + run: | + set -eux + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/" + + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} + shell: bash + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }} + shell: bash + run: | + set -ex + + if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + + - name: Upload binaries + env: + PACKAGE_TYPE: wheel + UPLOAD_SUBFOLDER: ${{ env.BUILD_DEVICE }} + PKG_DIR: ${{ runner.temp }}/artifacts + shell: bash + run: | + set -ex + bash .circleci/scripts/binary_upload.sh diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index db8fbcb4bdc7d..57fe7be15d298 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -57,6 +57,11 @@ jobs: echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" - name: Checkout optional submodules run: python3 tools/optional_submodules.py + - name: Copy docs requirements 
for inclusion + run: | + # Replace symlink with actual file + rm docs/requirements.txt || true + cp .ci/docker/requirements-docs.txt docs/requirements.txt - name: Create source distribution run: | # Create new folder with specified name so extracting the archive yields that diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index c27f651b6b3aa..492f41775d9de 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -50,35 +50,31 @@ jobs: runner: [linux.12xlarge] docker-image-name: [ pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm, - pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, - pytorch-linux-jammy-py3.9-clang12, - pytorch-linux-jammy-py3.11-clang12, - pytorch-linux-jammy-py3.12-clang12, + pytorch-linux-jammy-py3.10-clang12, pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-rocm-n-py3, pytorch-linux-noble-rocm-n-py3, pytorch-linux-noble-rocm-alpha-py3, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, - pytorch-linux-jammy-py3.9-gcc11, - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-rocm-n-py3-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12, + pytorch-linux-jammy-py3.10-gcc11, + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.12-halide, - pytorch-linux-jammy-xpu-2025.0-py3, - pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-xpu-n-1-py3, + pytorch-linux-jammy-xpu-n-py3, pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, # Executorch pin needs update # pytorch-linux-jammy-py3-clang12-executorch, - pytorch-linux-jammy-py3.12-triton-cpu + pytorch-linux-jammy-py3.12-triton-cpu, + pytorch-linux-noble-riscv64-py3.12-gcc14 ] include: - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 @@ -127,7 +123,7 @@ jobs: GHCR_PAT: ${{ secrets.GHCR_PAT }} with: shell: bash - timeout_minutes: 30 + timeout_minutes: 60 max_attempts: 5 retry_wait_seconds: 90 command: | diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 8cde3006e3816..860ee21cda6a7 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -47,7 +47,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cpu-aarch64-build: + manywheel-py3_10-cpu-aarch64-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -60,19 +60,18 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: 
"3.9" + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_9-cpu-aarch64 + build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-aarch64-test: # Testing + manywheel-py3_10-cpu-aarch64-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-cpu-aarch64-build + - manywheel-py3_10-cpu-aarch64-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -84,21 +83,20 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-aarch64 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.2xlarge ALPINE_IMAGE: "arm64v8/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-aarch64-upload: # Uploading + manywheel-py3_10-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_9-cpu-aarch64-test + needs: manywheel-py3_10-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -108,14 +106,13 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-aarch64 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_9-cuda-aarch64-12_9-build: + manywheel-py3_10-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -124,46 +121,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_9-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_10-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_9-cuda-aarch64-12_9-build + needs: manywheel-py3_10-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda-aarch64-12_9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cpu-aarch64-build: + manywheel-py3_10-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -172,66 +167,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # 
favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_10-cpu-aarch64 - build_environment: linux-aarch64-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-aarch64-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cpu-aarch64-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-aarch64 + build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.2xlarge - ALPINE_IMAGE: "arm64v8/alpine" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-aarch64-upload: # Uploading + manywheel-py3_10-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cpu-aarch64-test + needs: manywheel-py3_10-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - 
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-aarch64 + build_name: manywheel-py3_10-cuda-aarch64-12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda-aarch64-12_9-build: + manywheel-py3_10-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -240,41 +213,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine 
== 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda-aarch64-12_9-build + needs: manywheel-py3_10-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -292,7 +263,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral @@ -316,7 +286,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -340,14 +309,105 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda-aarch64-12_9-build: + manywheel-py3_11-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -356,41 +416,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cuda-aarch64-12_9-build + needs: manywheel-py3_11-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -408,7 +466,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral @@ -432,7 +489,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -456,14 +512,105 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - 
manywheel-py3_12-cuda-aarch64-12_9-build: + manywheel-py3_12-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: 
"3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -472,41 +619,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cuda-aarch64-12_9-build + needs: manywheel-py3_12-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: 
manywheel-py3_12-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -524,7 +669,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral @@ -548,7 +692,6 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -572,14 +715,13 @@ jobs: GPU_ARCH_TYPE: cpu-aarch64 DOCKER_IMAGE: manylinux2_28_aarch64-builder DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda-aarch64-12_9-build: + manywheel-py3_13-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -588,46 +730,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_13-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda-aarch64-12_9-build + needs: manywheel-py3_13-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cpu-aarch64-build: + manywheel-py3_13-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -636,66 +776,44 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.13t" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13t-cpu-aarch64 - build_environment: linux-aarch64-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cpu-aarch64-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cpu-aarch64-build - - get-label-type - uses: 
./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cpu-aarch64 + build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.2xlarge - ALPINE_IMAGE: "arm64v8/alpine" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cpu-aarch64-upload: # Uploading + manywheel-py3_13-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cpu-aarch64-test + needs: manywheel-py3_13-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu-aarch64 - DOCKER_IMAGE: manylinux2_28_aarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 - use_split_build: False - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cpu-aarch64 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda-aarch64-12_9-build: + manywheel-py3_13-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -704,41 +822,648 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - 
DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.13t" + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading + 
manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda-aarch64-12_9-build + needs: manywheel-py3_13-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9-aarch64 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + 
GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: 
manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of 
GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cpu-aarch64-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-aarch64-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cpu-aarch64-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-aarch64 + build_environment: linux-aarch64-binary-manywheel + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.2xlarge + ALPINE_IMAGE: "arm64v8/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-aarch64-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cpu-aarch64-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-aarch64 + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-aarch64 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + 
GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that 
we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 9f4a8194d2874..03835a9f5f352 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -122,7 +122,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -145,7 +145,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder 
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -154,7 +154,7 @@ jobs: build_name: libtorch-cuda12_6-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading @@ -169,7 +169,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 @@ -190,7 +190,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -213,7 +213,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -222,7 +222,7 @@ jobs: build_name: libtorch-cuda12_8-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading @@ -237,7 +237,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 @@ -248,7 +248,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -257,22 +257,22 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -280,38 +280,38 @@ jobs: 
PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -326,7 +326,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -350,7 +350,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder @@ -419,7 +419,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 @@ -440,7 +440,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 @@ -464,7 +464,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder @@ -533,7 +533,7 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 diff --git 
a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index d1e89bb6e2d85..ec08b2c78eb67 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -42,54 +42,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cuda12_6-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_6-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_6-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_9-cuda12_8-build: + 
manywheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -99,22 +52,21 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_8 + build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 
'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_8-test: # Testing + manywheel-py3_12-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-cuda12_8-build + - manywheel-py3_12-cuda12_8-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -123,62 +75,14 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_8 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_9-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - 
GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_9 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 464bef0e1f7db..8a581a1f21fe1 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -47,619 +47,6 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cpu - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cpu-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cpu-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cuda12_6-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - 
DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_6-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_6-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_6 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_6-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_6-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_6 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cuda12_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we 
eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_8 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_8 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_8-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: 
./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - 
uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-rocm6_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_3 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-rocm6_3-build - - get-label-type - runs-on: linux.rocm.gpu.mi250 - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 - GPU_ARCH_TYPE: rocm - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False - DESIRED_PYTHON: "3.9" - steps: - - name: Setup ROCm - uses: ./.github/actions/setup-rocm - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm6_3 - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: ROCm set GPU_FLAG - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: configure aws credentials - id: aws_creds - if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown ROCm - uses: ./.github/actions/teardown-rocm - manywheel-py3_9-rocm6_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-rocm6_3-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of 
in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm6_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-rocm6_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-rocm6_4-build - - get-label-type - runs-on: linux.rocm.gpu.mi250 - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 - GPU_ARCH_TYPE: rocm - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False - DESIRED_PYTHON: "3.9" - steps: - - name: Setup ROCm - uses: ./.github/actions/setup-rocm - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm6_4 - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: ROCm set GPU_FLAG - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: configure aws credentials - id: aws_creds - if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown ROCm - uses: 
./.github/actions/teardown-rocm - manywheel-py3_9-rocm6_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-rocm6_4-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm6_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-xpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-xpu - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-xpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-xpu-build - - get-label-type - runs-on: linux.idc.xpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False - DESIRED_PYTHON: "3.9" - permissions: - id-token: write - contents: read - steps: - - name: Setup XPU - uses: ./.github/actions/setup-xpu - - name: configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - - name: Login to Amazon ECR - id: login-ecr - uses: aws-actions/amazon-ecr-login@v2 - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-xpu - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd 
- working-directory: pytorch - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: xpu - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown XPU - uses: ./.github/actions/teardown-xpu - manywheel-py3_9-xpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-xpu-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-xpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -673,7 +60,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cpu @@ -695,7 +81,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu build_environment: linux-binary-manywheel @@ -718,7 +103,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu secrets: @@ -735,16 +119,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -759,16 +142,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -783,11 +165,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 secrets: @@ -804,16 +185,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: 
manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -828,16 +208,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: 
manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -852,18 +231,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda12_9-build: + manywheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -872,23 +250,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cuda12_9 + build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-test: # Testing + manywheel-py3_10-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-cuda12_9-build + - manywheel-py3_10-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -896,38 +273,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 + build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-upload: # Uploading + manywheel-py3_10-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda12_9-test + needs: manywheel-py3_10-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 + build_name: manywheel-py3_10-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -942,11 +317,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-rocm6_3 @@ -966,12 +340,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - 
GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1035,11 +408,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_3 secrets: @@ -1056,11 +428,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-rocm6_4 @@ -1080,12 +451,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1149,11 +519,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_4 secrets: @@ -1173,12 +542,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -1198,14 +566,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.10" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1266,7 +633,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-xpu secrets: @@ -1286,7 +652,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cpu @@ -1308,7 +673,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu build_environment: linux-binary-manywheel @@ -1331,7 +695,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu secrets: @@ -1348,16 +711,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -1372,16 +734,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -1396,11 +757,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 secrets: @@ -1417,16 +777,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -1441,16 +800,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -1465,86 +823,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - 
manywheel-py3_11-cuda12_8-full-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_8-full - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_8-full-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda12_8-full-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_8-full - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_8-full-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda12_8-full-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_8-full - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_11-cuda12_9-build: + manywheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1553,23 +842,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_9 + build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_9-test: # Testing + manywheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-cuda12_9-build + - manywheel-py3_11-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -1577,38 +865,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 + build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - 
manywheel-py3_11-cuda12_9-upload: # Uploading + manywheel-py3_11-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cuda12_9-test + needs: manywheel-py3_11-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 + build_name: manywheel-py3_11-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1623,11 +909,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-rocm6_3 @@ -1647,12 +932,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1716,11 +1000,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_3 secrets: @@ -1737,11 +1020,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-rocm6_4 @@ -1761,12 +1043,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1830,11 +1111,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_4 secrets: @@ -1854,12 +1134,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: 
manywheel-py3_11-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1879,14 +1158,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.11" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1947,7 +1225,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-xpu secrets: @@ -1967,7 +1244,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cpu @@ -1989,7 +1265,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu build_environment: linux-binary-manywheel @@ -2012,7 +1287,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu secrets: @@ -2029,16 +1303,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -2053,16 +1326,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -2077,11 +1349,10 @@ jobs: 
# TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 secrets: @@ -2098,16 +1369,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -2122,16 +1392,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -2146,18 +1415,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda12_9-build: + manywheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2166,23 +1434,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_9 + build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-test: # Testing + manywheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_12-cuda12_9-build + - manywheel-py3_12-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -2190,38 +1457,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 + build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-upload: # Uploading + manywheel-py3_12-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cuda12_9-test + needs: manywheel-py3_12-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 + build_name: manywheel-py3_12-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: 
./.github/workflows/_binary-upload.yml @@ -2236,11 +1501,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-rocm6_3 @@ -2260,12 +1524,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2329,11 +1592,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_3 secrets: @@ -2350,11 +1612,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-rocm6_4 @@ -2374,12 +1635,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2443,11 +1703,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_4 secrets: @@ -2467,12 +1726,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | 
intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-xpu-test: # Testing @@ -2492,14 +1750,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.12" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -2560,7 +1817,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-xpu secrets: @@ -2580,7 +1836,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cpu @@ -2602,7 +1857,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu build_environment: linux-binary-manywheel @@ -2625,7 +1879,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu secrets: @@ -2642,16 +1895,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2666,16 +1918,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2690,11 +1941,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 secrets: @@ -2711,16 +1961,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2735,16 +1984,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: 
linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2759,18 +2007,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda12_9-build: + manywheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2779,23 +2026,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda12_9 + build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-test: # Testing + manywheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-cuda12_9-build + - manywheel-py3_13-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -2803,38 +2049,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 + build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-upload: # Uploading + manywheel-py3_13-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda12_9-test + needs: manywheel-py3_13-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 + build_name: manywheel-py3_13-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2849,11 +2093,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-rocm6_3 @@ -2873,12 +2116,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13" steps: 
- name: Setup ROCm @@ -2942,11 +2184,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_3 secrets: @@ -2963,11 +2204,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-rocm6_4 @@ -2987,12 +2227,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13" steps: - name: Setup ROCm @@ -3056,11 +2295,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_4 secrets: @@ -3080,12 +2318,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -3105,14 +2342,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - 
use_split_build: False DESIRED_PYTHON: "3.13" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3173,7 +2409,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-xpu secrets: @@ -3193,7 +2428,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cpu @@ -3215,7 +2449,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu build_environment: linux-binary-manywheel @@ -3238,7 +2471,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu secrets: @@ -3255,16 +2487,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -3279,16 +2510,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -3303,11 +2533,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 secrets: @@ -3324,16 +2553,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -3348,16 +2576,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -3372,18 +2599,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda12_9-build: + manywheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml 
needs: get-label-type @@ -3392,23 +2618,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_9 + build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-test: # Testing + 
manywheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-cuda12_9-build + - manywheel-py3_13t-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -3416,38 +2641,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 + build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-upload: # Uploading + manywheel-py3_13t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda12_9-test + needs: manywheel-py3_13t-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 + build_name: manywheel-py3_13t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3462,11 +2685,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-rocm6_3 @@ -3486,12 +2708,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13t" steps: - name: Setup ROCm @@ -3555,11 +2776,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_3 secrets: @@ -3576,11 +2796,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-rocm6_4 @@ -3600,12 +2819,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13t" steps: - name: Setup ROCm @@ -3669,11 +2887,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_4 secrets: @@ -3693,12 +2910,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3718,14 +2934,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13t" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3786,7 +3001,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-xpu secrets: @@ -3806,7 +3020,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cpu 
@@ -3828,7 +3041,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cpu build_environment: linux-binary-manywheel @@ -3851,7 +3063,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cpu secrets: @@ -3868,16 +3079,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-test: # Testing @@ -3892,16 +3102,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-upload: # Uploading @@ -3916,11 +3125,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_6 secrets: @@ -3937,16 +3145,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-test: # Testing @@ -3961,16 +3168,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_8-upload: # Uploading @@ -3985,18 +3191,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda12_9-build: + manywheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -4005,23 +3210,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-cuda12_9 + build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-test: # Testing + manywheel-py3_14-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14-cuda12_9-build + - manywheel-py3_14-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -4029,38 +3233,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 + build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-upload: # Uploading + manywheel-py3_14-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-cuda12_9-test + needs: manywheel-py3_14-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 + build_name: manywheel-py3_14-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -4075,11 +3277,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-rocm6_3 @@ -4099,12 +3300,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14" steps: - name: Setup ROCm @@ -4168,11 +3368,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-rocm6_3 secrets: @@ -4189,11 +3388,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-rocm6_4 @@ -4213,12 +3411,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14" steps: - name: Setup ROCm @@ -4282,11 +3479,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-rocm6_4 
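The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are " | "-separated PEP 508 requirements, most of them gated on platform_system == 'Linux' and platform_machine == 'x86_64' markers; for CUDA 13.0 the package names also change (nvidia-cublas, nvidia-nccl-cu13, nvidia-cusparselt-cu13, and so on, instead of the -cu12 names). A small sketch of the format using the packaging library (illustration only; how the build actually consumes this variable is out of scope here, and the three entries are copied from requirement strings elsewhere in these hunks):

# Illustration of the PYTORCH_EXTRA_INSTALL_REQUIREMENTS format only; this
# snippet is not part of the workflow. Requires the 'packaging' library.
from packaging.requirements import Requirement

extra = (
    "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "onemkl-sycl-blas==2025.2.0"
)

for spec in extra.split(" | "):
    req = Requirement(spec)
    # Entries without a marker (req.marker is None) apply on every platform.
    applies = req.marker is None or req.marker.evaluate()
    print(f"{req.name}{req.specifier}: {'applies here' if applies else 'skipped on this platform'}")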
secrets: @@ -4306,12 +3502,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-xpu-test: # Testing @@ -4331,14 +3526,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -4399,7 +3593,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14" build_name: manywheel-py3_14-xpu secrets: @@ -4419,7 +3612,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cpu @@ -4441,7 +3633,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cpu build_environment: linux-binary-manywheel @@ -4464,7 +3655,6 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cpu secrets: @@ -4481,16 +3671,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" 
build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-test: # Testing @@ -4505,16 +3694,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 
for other cuda versions, we use 4xlarge runner + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-upload: # Uploading @@ -4529,11 +3717,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_6 secrets: @@ -4550,16 +3737,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system 
== 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-test: # Testing @@ -4574,16 +3760,15 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-upload: # Uploading @@ -4598,18 +3783,17 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda12_9-build: + manywheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -4618,23 +3802,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-cuda12_9 + build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-test: # Testing + manywheel-py3_14t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14t-cuda12_9-build + - manywheel-py3_14t-cuda13_0-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -4642,38 +3825,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 + build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-upload: # Uploading + manywheel-py3_14t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-cuda12_9-test + needs: manywheel-py3_14t-cuda13_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - use_split_build: False + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 + build_name: manywheel-py3_14t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -4688,11 +3869,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-rocm6_3 @@ -4712,12 +3892,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14t" steps: - name: Setup ROCm @@ -4781,11 +3960,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + GPU_ARCH_VERSION: "6.3" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.3 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-rocm6_3 secrets: @@ -4802,11 +3980,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-rocm6_4 @@ -4826,12 +4003,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14t" steps: - name: Setup ROCm @@ -4895,11 +4071,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-rocm6_4 secrets: @@ -4919,12 +4094,11 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | 
onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-xpu-test: # Testing @@ -4944,14 +4118,13 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14t" permissions: id-token: write contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@main - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -5012,7 +4185,6 @@ jobs: GPU_ARCH_TYPE: xpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: xpu - use_split_build: False DESIRED_PYTHON: "3.14t" build_name: manywheel-py3_14t-xpu secrets: diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index b6b63c4e38d5e..8177bac3fe216 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -54,11 +54,10 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-rocm6_4 @@ -78,12 +77,11 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - use_split_build: False DESIRED_PYTHON: "3.9" steps: - name: Setup ROCm diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index 66c0813afe900..4a7ebe8366336 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -47,7 +47,7 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cpu-s390x-build: + manywheel-py3_10-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -60,19 +60,18 @@ jobs: 
GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_9-cpu-s390x + build_name: manywheel-py3_10-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-s390x-test: # Testing + manywheel-py3_10-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-cpu-s390x-build + - manywheel-py3_10-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -84,20 +83,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-s390x + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-s390x-upload: # Uploading + manywheel-py3_10-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_9-cpu-s390x-test + needs: manywheel-py3_10-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -107,14 +105,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu-s390x + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cpu-s390x-build: + manywheel-py3_11-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -127,19 +124,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_10-cpu-s390x + build_name: manywheel-py3_11-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-s390x-test: # Testing + manywheel-py3_11-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-cpu-s390x-build + - manywheel-py3_11-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -151,20 +147,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-s390x + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-s390x-upload: # Uploading + manywheel-py3_11-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cpu-s390x-test + needs: manywheel-py3_11-cpu-s390x-test 
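In the s390x nightly file, 3.9 is dropped and the job chains shift up one Python version, with new 3.14 and 3.14t chains added further below. Because this is a generated workflow, the renames fan out mechanically from a version list into build/test/upload job names; the following is a hypothetical sketch of that fan-out, not the actual generator script:

# Hypothetical sketch, not the real workflow generator: how a Python-version
# list expands into the build -> test -> upload job names used in this
# generated s390x file.
PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]

def job_chain(py_version: str, arch: str = "cpu-s390x") -> list[str]:
    base = f"manywheel-py{py_version.replace('.', '_')}-{arch}"
    return [f"{base}-build", f"{base}-test", f"{base}-upload"]

for version in PYTHON_VERSIONS:
    print(" -> ".join(job_chain(version)))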
with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -174,14 +169,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu-s390x + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cpu-s390x-build: + manywheel-py3_12-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -194,19 +188,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_11-cpu-s390x + build_name: manywheel-py3_12-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cpu-s390x-test: # Testing + manywheel-py3_12-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-cpu-s390x-build + - manywheel-py3_12-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -218,20 +211,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cpu-s390x + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cpu-s390x-upload: # Uploading + manywheel-py3_12-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cpu-s390x-test + needs: manywheel-py3_12-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -241,14 +233,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cpu-s390x + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cpu-s390x-build: + manywheel-py3_13-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -261,19 +252,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_12-cpu-s390x + build_name: manywheel-py3_13-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cpu-s390x-test: # Testing + manywheel-py3_13-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_12-cpu-s390x-build + - manywheel-py3_13-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -285,20 +275,19 @@ jobs: GPU_ARCH_TYPE: 
cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cpu-s390x + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cpu-s390x-upload: # Uploading + manywheel-py3_13-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cpu-s390x-test + needs: manywheel-py3_13-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -308,14 +297,13 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cpu-s390x + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cpu-s390x-build: + manywheel-py3_13t-cpu-s390x-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -328,19 +316,18 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.13t" runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 - build_name: manywheel-py3_13-cpu-s390x + build_name: manywheel-py3_13t-cpu-s390x build_environment: linux-s390x-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cpu-s390x-test: # Testing + manywheel-py3_13t-cpu-s390x-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-cpu-s390x-build + - manywheel-py3_13t-cpu-s390x-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -352,20 +339,19 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x build_environment: linux-s390x-binary-manywheel runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cpu-s390x-upload: # Uploading + manywheel-py3_13t-cpu-s390x-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cpu-s390x-test + needs: manywheel-py3_13t-cpu-s390x-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel @@ -375,9 +361,136 @@ jobs: GPU_ARCH_TYPE: cpu-s390x DOCKER_IMAGE: pytorch/manylinuxs390x-builder DOCKER_IMAGE_TAG_PREFIX: cpu-s390x - use_split_build: False - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # 
favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a 
legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index ad7a1cf1d71df..500f8fa07af6b 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -46,7 +46,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -67,11 +67,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index 7c4cc4ab55176..6aee57b503aa2 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -30,7 +30,7 @@ concurrency: cancel-in-progress: true jobs: - wheel-py3_9-cpu-build: + wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -42,7 +42,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -63,11 +63,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -115,12 +110,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # 
shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -129,16 +145,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cpu + name: wheel-py3_10-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading + wheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cpu-build + needs: wheel-py3_10-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -148,13 +164,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -166,7 +182,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -187,11 +203,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -239,12 +250,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 
+ desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -253,16 +285,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cpu-build + needs: wheel-py3_11-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -272,13 +304,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -290,7 +322,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -311,11 +343,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -363,12 +390,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -377,16 +425,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ 
env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cpu-build + needs: wheel-py3_12-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -396,13 +444,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -414,7 +462,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -435,11 +483,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -487,12 +530,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -501,16 +565,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_13-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_13-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cpu-build + needs: wheel-py3_13-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -520,13 +584,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + 
DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cpu-build: + wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -538,7 +602,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -559,11 +623,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -611,12 +670,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -625,16 +705,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cpu + name: wheel-py3_13t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_13-cpu-upload: # Uploading + wheel-py3_13t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cpu-build + needs: wheel-py3_13t-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -644,13 +724,13 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cpu-build: + wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: macos-14-xlarge timeout-minutes: 240 @@ -662,7 +742,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are 
put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -683,11 +763,6 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -735,12 +810,33 @@ jobs: # Create new "clean" conda environment for testing SMOKE_TEST_PARAMS="" - if [[ $DESIRED_PYTHON == "3.13t" ]]; then - conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge - SMOKE_TEST_PARAMS="--torch-compile-check disabled" - else - conda create -yn "test_conda_env" python="$DESIRED_PYTHON" - fi + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} conda activate test_conda_env pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v @@ -749,16 +845,16 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cpu + name: wheel-py3_14-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_13t-cpu-upload: # Uploading + wheel-py3_14-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cpu-build + needs: wheel-py3_14-cpu-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: wheel @@ -768,8 +864,148 @@ jobs: GPU_ARCH_TYPE: cpu DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cpu - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cpu + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cpu + use_s3: False + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_14t-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-14-xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.14t" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" + + # Build + USE_PYTORCH_METAL_EXPORT=1 + USE_COREML_DELEGATE=1 + TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}" + export USE_PYTORCH_METAL_EXPORT + export USE_COREML_DELEGATE + export TORCH_PACKAGE_NAME + "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" + - name: Test PyTorch wheel + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + pip uninstall -y "$TORCH_PACKAGE_NAME" || true + + # Create new "clean" conda environment for testing + + SMOKE_TEST_PARAMS="" + + EXTRA_CONDA_INSTALL_FLAGS="" + CONDA_ENV_CREATE_FLAGS="" + # shellcheck disable=SC2153 + case $DESIRED_PYTHON in + 3.14t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.14) + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" + desired_python="3.14.0rc1" + ;; + 3.13t) + CONDA_ENV_CREATE_FLAGS="python-freethreading" + EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" + desired_python="3.13" + ;; + *) + # shellcheck disable=SC2153 + desired_python=${DESIRED_PYTHON} + ;; + esac + + # shellcheck disable=SC2086 + conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS} + conda activate test_conda_env + pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v + + # shellcheck disable=SC2086 + python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS} + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_14t-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_14t-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' 
}} + permissions: + id-token: write + contents: read + needs: wheel-py3_14t-cpu-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cpu use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 2c86e7e103598..7c26dbc3b9eea 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 912a452f0ee8a..5e30b66183840 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for 
libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1dd70d0d06a91..1368bc942350e 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -124,7 +124,7 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -198,7 +198,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,7 +271,7 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -345,7 +345,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -418,7 +418,7 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml index ac15a9f3e97ac..818d2ca45cc4c 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -153,7 +153,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +166,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 75c393b46e59b..67fdecdf6e866 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -160,7 +160,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +173,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -283,7 +283,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,21 +292,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -408,21 +408,21 @@ jobs: - libtorch-cuda12_6-shared-with-deps-debug-build - get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -527,13 +527,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,21 +542,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -658,21 +658,21 @@ jobs: - libtorch-cuda12_8-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -777,36 +777,36 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-debug 
secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-debug-build: + libtorch-cuda13_0-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -884,7 +884,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +902,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + libtorch-cuda13_0-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-debug-build + - libtorch-cuda13_0-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -992,7 +992,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1015,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-debug-test + needs: libtorch-cuda13_0-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug 
LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-debug + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml index 9a0a3496e37b3..ff8a2bbbfe1ef 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -153,7 +153,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +166,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index eccd332c74a1f..8efca3b7571b9 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -160,7 +160,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +173,7 @@ jobs: LIBTORCH_VARIANT: 
shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -283,7 +283,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,21 +292,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -408,21 +408,21 @@ jobs: - libtorch-cuda12_6-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -527,13 +527,13 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,21 +542,21 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch 
to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -658,21 +658,21 @@ jobs: - libtorch-cuda12_8-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -777,36 +777,36 @@ jobs: # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -884,7 +884,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +902,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type 
}}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -992,7 +992,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1015,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-release + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index 22ebe8db70eac..154dadbe6a1e3 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -41,11 +41,1196 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - wheel-py3_9-cpu-build: + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + 
# TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
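The "Display EC2 information" step uses IMDSv2: it first obtains a short-lived session token with a PUT request, then sends that token in the X-aws-ec2-metadata-token header when reading a metadata category. A rough Python equivalent of the curl-based get_ec2_metadata helper (sketch only):

    # Sketch only: a Python equivalent of the curl-based get_ec2_metadata helper
    # above, using IMDSv2 (token via PUT, then token-authenticated GET).
    import urllib.request

    IMDS = "http://169.254.169.254/latest"

    def get_ec2_metadata(category: str, ttl_seconds: int = 30) -> str:
        token_req = urllib.request.Request(
            f"{IMDS}/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": str(ttl_seconds)},
        )
        with urllib.request.urlopen(token_req, timeout=5) as resp:
            token = resp.read().decode()
        meta_req = urllib.request.Request(
            f"{IMDS}/meta-data/{category}",
            headers={"X-aws-ec2-metadata-token": token},
        )
        with urllib.request.urlopen(meta_req, timeout=5) as resp:
            return resp.read().decode()

    for category in ("ami-id", "instance-id", "instance-type"):
        print(f"{category}: {get_ec2_metadata(category)}")
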
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cuda12_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda12_6-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda12_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_6-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
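The checkout steps in these jobs pin the ref with github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha, i.e. the pull request's head commit on pull_request events and the pushed commit otherwise; the &&/|| chain acts as a conditional because a commit SHA is never an empty string. An equivalent sketch in Python (illustrative only):

    # Sketch: the checkout step's ref expression behaves like a conditional.
    def resolve_checkout_ref(event_name: str, pr_head_sha: str | None, pushed_sha: str) -> str:
        # Same selection logic as the workflow expression; the &&/|| form is safe
        # here because a commit SHA is never falsy.
        return pr_head_sha if event_name == "pull_request" and pr_head_sha else pushed_sha

    assert resolve_checkout_ref("pull_request", "abc123", "def456") == "abc123"
    assert resolve_checkout_ref("push", None, "def456") == "def456"
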
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cuda12_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda12_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda12_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
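Every configuration above follows the same build, test, upload triplet, and the artifact and build_name values follow one pattern: the package type, the Python version with dots replaced by underscores, and an accelerator tag derived from DESIRED_CUDA. The helper below is purely illustrative (it does not exist in the repository) and only restates that visible convention:

    def build_name(package_type: str, python_version: str, desired_cuda: str) -> str:
        # Hypothetical helper, only restating the naming pattern of the jobs above.
        py_tag = "py" + python_version.replace(".", "_")
        accel = desired_cuda
        if accel.startswith("cu") and accel[2:].isdigit():
            digits = accel[2:]                      # "126" -> "12_6", "130" -> "13_0"
            accel = f"cuda{digits[:-1]}_{digits[-1]}"
        return f"{package_type}-{py_tag}-{accel}"

    assert build_name("wheel", "3.10", "cpu") == "wheel-py3_10-cpu"
    assert build_name("wheel", "3.10", "cu126") == "wheel-py3_10-cuda12_6"
    assert build_name("wheel", "3.10", "cu130") == "wheel-py3_10-cuda13_0"
    assert build_name("wheel", "3.10", "xpu") == "wheel-py3_10-xpu"
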
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-cuda13_0 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-cuda13_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-cuda13_0-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda13_0 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda13_0-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda13_0 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
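The XPU build above pins its Intel runtime dependencies through PYTORCH_EXTRA_INSTALL_REQUIREMENTS, a pipe-separated list in which several entries carry PEP 508 environment markers so they only apply on Linux/x86_64. The sketch below shows how such markers evaluate with the packaging library; splitting on '|' mirrors how the string appears to be consumed downstream, which is an assumption here:

    # Sketch: interpreting the '|'-separated requirement string with PEP 508 markers.
    from packaging.requirements import Requirement

    extra = (
        "intel-cmplr-lib-rt==2025.2.1 | "
        "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "mkl==2025.2.0"
    )

    for spec in (s.strip() for s in extra.split("|")):
        req = Requirement(spec)
        applies = req.marker.evaluate() if req.marker is not None else True
        print(f"{req.name}{req.specifier}: install on this platform -> {applies}")
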
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-xpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_10-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-xpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 360 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-xpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-xpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: 
wheel @@ -54,7 +1239,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -132,7 +1317,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cpu + name: wheel-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -150,13 +1335,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-test: # Testing + wheel-py3_11-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cpu-build + - wheel-py3_11-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -165,7 +1350,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -235,7 +1420,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cpu + name: wheel-py3_11-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -258,12 +1443,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-upload: # Uploading + wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cpu-test + needs: wheel-py3_11-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,26 +1456,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda12_6-build: + wheel-py3_11-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -368,7 +1553,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cuda12_6 + name: wheel-py3_11-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -386,23 +1571,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_6-test: # Testing + wheel-py3_11-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cuda12_6-build + - wheel-py3_11-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + 
timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -472,7 +1657,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cuda12_6 + name: wheel-py3_11-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -495,40 +1680,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_6-upload: # Uploading + wheel-py3_11-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cuda12_6-test + needs: wheel-py3_11-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda12_6 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda12_8-build: + wheel-py3_11-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -606,7 +1791,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cuda12_8 + name: wheel-py3_11-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -624,23 +1809,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_8-test: # Testing + wheel-py3_11-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cuda12_8-build + - wheel-py3_11-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -710,7 +1895,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cuda12_8 + name: wheel-py3_11-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ 
-733,40 +1918,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_8-upload: # Uploading + wheel-py3_11-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cuda12_8-test + needs: wheel-py3_11-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda12_8 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda12_9-build: + wheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -844,7 +2029,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-cuda12_9 + name: wheel-py3_11-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -862,23 +2047,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_9-test: # Testing + wheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-cuda12_9-build + - wheel-py3_11-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -948,7 +2133,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-cuda12_9 + name: wheel-py3_11-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -971,30 +2156,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda12_9-upload: # Uploading + wheel-py3_11-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-cuda12_9-test + needs: wheel-py3_11-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" 
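Besides the cu129-to-cu130 and 3.9-to-3.11 renames, these hunks also change GPU_ARCH_VERSION from an unquoted scalar (12.6, 12.8, 12.9) to a quoted string ("12.6", "12.8", "13.0"). Unquoted, YAML resolves such values as floats, which can alter their textual form (the same reason DESIRED_PYTHON is quoted, since 3.10 would otherwise load as 3.1), so quoting preserves the exact version string; that is presumably the motivation here. A small PyYAML sketch (assumes PyYAML is installed; GitHub Actions' own parser is assumed to resolve these scalars the same way):

    # Sketch: why quoting version-like values in workflow YAML matters.
    import yaml

    print(yaml.safe_load("DESIRED_PYTHON: 3.10"))      # {'DESIRED_PYTHON': 3.1}    float, trailing zero lost
    print(yaml.safe_load('DESIRED_PYTHON: "3.10"'))    # {'DESIRED_PYTHON': '3.10'} exact string
    print(yaml.safe_load("GPU_ARCH_VERSION: 13.0"))    # {'GPU_ARCH_VERSION': 13.0} float, not the string "13.0"
    print(yaml.safe_load('GPU_ARCH_VERSION: "13.0"'))  # {'GPU_ARCH_VERSION': '13.0'}
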
GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda12_9 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-xpu-build: + wheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1003,8 +2188,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1082,7 +2267,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-xpu + name: wheel-py3_11-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1100,13 +2285,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-test: # Testing + wheel-py3_11-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-xpu-build + - wheel-py3_11-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1115,7 +2300,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1185,7 +2370,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-xpu + name: wheel-py3_11-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1208,12 +2393,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-upload: # Uploading + 
wheel-py3_11-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-xpu-test + needs: wheel-py3_11-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1221,16 +2406,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-xpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1239,7 +2424,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1317,7 +2502,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1335,13 +2520,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing + wheel-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cpu-build + - wheel-py3_12-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1350,7 +2535,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -1420,7 +2605,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cpu + name: wheel-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1443,12 +2628,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cpu-test + needs: wheel-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1456,26 +2641,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_6-build: + wheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda 
SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1553,7 +2738,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_12-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1571,23 +2756,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-test: # Testing + wheel-py3_12-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_6-build + - wheel-py3_12-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -1657,7 +2842,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_12-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1680,40 +2865,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-upload: # Uploading + wheel-py3_12-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_6-test + needs: wheel-py3_12-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_6 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_8-build: + wheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1791,7 +2976,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_12-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1809,23 +2994,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-test: # Testing + 
wheel-py3_12-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_8-build + - wheel-py3_12-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -1895,7 +3080,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_12-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1918,40 +3103,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-upload: # Uploading + wheel-py3_12-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_8-test + needs: wheel-py3_12-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_8 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_9-build: + wheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2029,7 +3214,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_12-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2047,23 +3232,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-test: # Testing + wheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_9-build + - wheel-py3_12-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: 
"3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2133,7 +3318,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_12-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2156,30 +3341,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-upload: # Uploading + wheel-py3_12-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_9-test + needs: wheel-py3_12-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_9 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-xpu-build: + wheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2188,8 +3373,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2267,7 +3452,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-xpu + name: wheel-py3_12-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2285,13 +3470,13 @@ jobs: run: | 
.github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-test: # Testing + wheel-py3_12-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-xpu-build + - wheel-py3_12-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2300,7 +3485,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2370,7 +3555,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-xpu + name: wheel-py3_12-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2393,12 +3578,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-upload: # Uploading + wheel-py3_12-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-xpu-test + needs: wheel-py3_12-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2406,16 +3591,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-xpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: + wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2424,7 +3609,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2502,7 +3687,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_13-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2520,13 +3705,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-test: # Testing + wheel-py3_13-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cpu-build + - wheel-py3_13-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2535,7 +3720,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -2605,7 +3790,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cpu + name: wheel-py3_13-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2628,12 +3813,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_13-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: 
id-token: write contents: read - needs: wheel-py3_11-cpu-test + needs: wheel-py3_13-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2641,26 +3826,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_6-build: + wheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2738,7 +3923,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_13-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2756,23 +3941,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-test: # Testing + wheel-py3_13-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_6-build + - wheel-py3_13-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -2842,7 +4027,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_13-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2865,40 +4050,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-upload: # Uploading + wheel-py3_13-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_6-test + needs: wheel-py3_13-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_6 + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_8-build: + wheel-py3_13-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2976,7 +4161,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_13-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2994,23 +4179,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-test: # Testing + wheel-py3_13-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_8-build + - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -3080,7 +4265,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_13-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3103,40 +4288,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-upload: # Uploading + wheel-py3_13-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_8-test + needs: wheel-py3_13-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_8 + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_9-build: + wheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us 
access to the @@ -3214,7 +4399,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_13-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3232,23 +4417,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-test: # Testing + wheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_9-build + - wheel-py3_13-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -3318,7 +4503,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_13-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3341,30 +4526,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-upload: # Uploading + wheel-py3_13-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_9-test + needs: wheel-py3_13-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_9 + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-xpu-build: + wheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3373,8 +4558,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.13" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' 
and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3452,7 +4637,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-xpu + name: wheel-py3_13-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3470,13 +4655,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-test: # Testing + wheel-py3_13-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-xpu-build + - wheel-py3_13-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3485,7 +4670,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -3555,7 +4740,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-xpu + name: wheel-py3_13-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3578,12 +4763,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-upload: # Uploading + wheel-py3_13-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-xpu-test + needs: wheel-py3_13-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3591,16 +4776,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-xpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3609,7 +4794,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3687,7 +4872,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_13t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3705,13 +4890,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-test: # Testing + wheel-py3_13t-cpu-test: # Testing if: ${{ 
github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cpu-build + - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3720,7 +4905,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -3790,7 +4975,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cpu + name: wheel-py3_13t-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3813,12 +4998,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_13t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cpu-test + needs: wheel-py3_13t-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3826,26 +5011,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_6-build: + wheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3923,7 +5108,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_13t-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3941,23 +5126,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-test: # Testing + wheel-py3_13t-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_6-build + - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4027,7 +5212,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_13t-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4050,40 +5235,40 @@ jobs: 
if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-upload: # Uploading + wheel-py3_13t-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_6-test + needs: wheel-py3_13t-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_6 + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_8-build: + wheel-py3_13t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4161,7 +5346,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13t-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4179,23 +5364,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-test: # Testing + wheel-py3_13t-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_8-build + - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4265,7 +5450,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13t-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4288,40 +5473,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-upload: # Uploading + wheel-py3_13t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_8-test + needs: wheel-py3_13t-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: 
wheel-py3_12-cuda12_8 + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_9-build: + wheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4399,7 +5584,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_13t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4417,23 +5602,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_9-test: # Testing + wheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_9-build + - wheel-py3_13t-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4503,7 +5688,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_13t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4526,30 +5711,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_9-upload: # Uploading + wheel-py3_13t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_9-test + needs: wheel-py3_13t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_9 + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-xpu-build: + wheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4558,8 +5743,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu 
SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.13t" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4637,7 +5822,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-xpu + name: wheel-py3_13t-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4655,13 +5840,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-test: # Testing + wheel-py3_13t-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-xpu-build + - wheel-py3_13t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4670,7 +5855,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -4740,7 +5925,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-xpu + name: wheel-py3_13t-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4763,12 +5948,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-upload: # Uploading + wheel-py3_13t-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-xpu-test + needs: wheel-py3_13t-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4776,16 +5961,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-xpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cpu-build: + 
wheel-py3_14-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4794,7 +5979,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4872,7 +6057,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cpu + name: wheel-py3_14-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4890,13 +6075,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-test: # Testing + wheel-py3_14-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cpu-build + - wheel-py3_14-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4905,7 +6090,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -4975,7 +6160,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cpu + name: wheel-py3_14-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4998,12 +6183,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-upload: # Uploading + wheel-py3_14-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cpu-test + needs: wheel-py3_14-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5011,26 +6196,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cpu + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_6-build: + wheel-py3_14-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5108,7 +6293,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_14-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5126,23 +6311,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - 
wheel-py3_13-cuda12_6-test: # Testing + wheel-py3_14-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_6-build + - wheel-py3_14-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5212,7 +6397,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_14-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5235,40 +6420,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_6-upload: # Uploading + wheel-py3_14-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_6-test + needs: wheel-py3_14-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_6 + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_8-build: + wheel-py3_14-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5346,7 +6531,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_14-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5364,23 +6549,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-test: # Testing + wheel-py3_14-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_8-build + - wheel-py3_14-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + 
DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5450,7 +6635,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_14-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5473,40 +6658,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-upload: # Uploading + wheel-py3_14-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_8-test + needs: wheel-py3_14-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_8 + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_9-build: + wheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5584,7 +6769,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_14-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5602,23 +6787,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-test: # Testing + wheel-py3_14-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_9-build + - wheel-py3_14-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5688,7 +6873,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_14-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5711,30 +6896,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-upload: # Uploading + wheel-py3_14-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read 
- needs: wheel-py3_13-cuda12_9-test + needs: wheel-py3_14-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_9 + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-xpu-build: + wheel-py3_14-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5743,8 +6928,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.14" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5822,7 +7007,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-xpu + name: wheel-py3_14-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5840,13 +7025,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-test: # Testing + wheel-py3_14-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-xpu-build + - wheel-py3_14-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5855,7 +7040,7 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.14" steps: - name: Display EC2 information shell: bash @@ -5925,7 +7110,7 @@ jobs: - 
uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-xpu + name: wheel-py3_14-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5948,12 +7133,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-upload: # Uploading + wheel-py3_14-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-xpu-test + needs: wheel-py3_14-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5961,16 +7146,16 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-xpu + DESIRED_PYTHON: "3.14" + build_name: wheel-py3_14-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cpu-build: + wheel-py3_14t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5979,7 +7164,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6057,7 +7242,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cpu + name: wheel-py3_14t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6075,13 +7260,13 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-test: # Testing + wheel-py3_14t-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cpu-build + - wheel-py3_14t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6090,7 +7275,7 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6160,7 +7345,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cpu + name: wheel-py3_14t-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6183,12 +7368,12 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-upload: # Uploading + wheel-py3_14t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cpu-test + needs: wheel-py3_14t-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6196,26 +7381,26 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cpu + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_6-build: + wheel-py3_14t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6293,7 +7478,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_14t-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6311,23 +7496,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-test: # Testing + wheel-py3_14t-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_6-build + - wheel-py3_14t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6397,7 +7582,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_14t-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6420,40 +7605,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-upload: # Uploading + wheel-py3_14t-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_6-test + needs: wheel-py3_14t-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + GPU_ARCH_VERSION: "12.6" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_6 + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_8-build: + wheel-py3_14t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us 
access to the @@ -6531,7 +7716,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_14t-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6549,23 +7734,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-test: # Testing + wheel-py3_14t-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_8-build + - wheel-py3_14t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6635,7 +7820,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_14t-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6658,40 +7843,40 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-upload: # Uploading + wheel-py3_14t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_8-test + needs: wheel-py3_14t-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_8 + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_9-build: + wheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6769,7 +7954,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_14t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6787,23 +7972,23 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-test: # Testing + wheel-py3_14t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_9-build + - wheel-py3_14t-cuda13_0-build - get-label-type runs-on: "${{ 
needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" + DESIRED_PYTHON: "3.14t" steps: - name: Display EC2 information shell: bash @@ -6873,7 +8058,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_14t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6896,30 +8081,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-upload: # Uploading + wheel-py3_14t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_9-test + needs: wheel-py3_14t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: 12.9 + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_9 + DESIRED_PYTHON: "3.14t" + build_name: wheel-py3_14t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-xpu-build: + wheel-py3_14t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6928,8 +8113,8 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + DESIRED_PYTHON: "3.14t" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables 
are put here so that they can be applied on every job equally
      # They are also here because setting them at a workflow level doesn't give us access to the
@@ -7007,7 +8192,7 @@ jobs:
      - uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
-          name: wheel-py3_13t-xpu
+          name: wheel-py3_14t-xpu
          retention-days: 14
          if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -7025,13 +8210,13 @@ jobs:
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_13t-xpu-test: # Testing
+  wheel-py3_14t-xpu-test: # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - wheel-py3_13t-xpu-build
+      - wheel-py3_14t-xpu-build
      - get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 300
+    timeout-minutes: 360
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: wheel
@@ -7040,7 +8225,7 @@ jobs:
      DESIRED_CUDA: xpu
      GPU_ARCH_TYPE: xpu
      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.13t"
+      DESIRED_PYTHON: "3.14t"
    steps:
      - name: Display EC2 information
        shell: bash
@@ -7110,7 +8295,7 @@ jobs:
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
-          name: wheel-py3_13t-xpu
+          name: wheel-py3_14t-xpu
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Populate binary env
        shell: bash
@@ -7133,12 +8318,12 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_13t-xpu-upload: # Uploading
+  wheel-py3_14t-xpu-upload: # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    permissions:
      id-token: write
      contents: read
-    needs: wheel-py3_13t-xpu-test
+    needs: wheel-py3_14t-xpu-test
    with:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: wheel
@@ -7146,8 +8331,8 @@ jobs:
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: xpu
      GPU_ARCH_TYPE: xpu
-      DESIRED_PYTHON: "3.13t"
-      build_name: wheel-py3_13t-xpu
+      DESIRED_PYTHON: "3.14t"
+      build_name: wheel-py3_14t-xpu
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
diff --git a/.github/workflows/h100-cutlass-backend.yml b/.github/workflows/h100-cutlass-backend.yml
index 82dc2ae2a3944..edf4c2e0e807c 100644
--- a/.github/workflows/h100-cutlass-backend.yml
+++ b/.github/workflows/h100-cutlass-backend.yml
@@ -4,9 +4,12 @@ on:
  pull_request:
    paths:
      - .github/workflows/h100-cutlass-backend.yml
+      - torch/_inductor/codegen/cuda/**
+      - test/inductor/test_cutlass_backend.py
+      - test/inductor/test_cutlass_evt.py
  workflow_dispatch:
  schedule:
-    - cron: 22 9 * * * # every 24 hours about 2:22am PDT
+    - cron: 22 9,21 * * * # every 12 hours
  push:
    tags:
      - ciflow/h100-cutlass-backend/*
diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml
index 117183428abc1..c6cc075e6b270 100644
--- a/.github/workflows/inductor-micro-benchmark-x86.yml
+++ b/.github/workflows/inductor-micro-benchmark-x86.yml
@@ -18,13 +18,13 @@ permissions:
  contents: read
jobs:
-  linux-jammy-cpu-py3_9-gcc11-inductor-build:
+  inductor-build:
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    name: linux-jammy-cpu-py3.9-gcc11-inductor
+    name: inductor-build
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-jammy-py3.9-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
+      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      # Use metal host for benchmark jobs
      test-matrix: |
        { include: [
@@ -32,13 +32,13 @@ jobs:
        ]}
    secrets: inherit
-
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-micro-benchmark-test: + name: inductor-micro-benchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build with: build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index c17a4ed6341aa..fe0f102406b6a 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -32,13 +32,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-build: + name: nightly-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -51,13 +51,13 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-test: + name: nightly-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build + needs: nightly-dynamo-benchmarks-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index 2b59777aae8c7..41210f89c9a89 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -58,9 +58,14 @@ on: required: false type: string default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100 + pull_request: + # Changing these files guarantees that this workflow needs to be run + paths: + - .github/workflows/inductor-perf-test-nightly-h100.yml + - .ci/docker/ci_commit_pins/huggingface-requirements.txt concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ 
github.event_name == 'schedule' }} cancel-in-progress: true permissions: @@ -79,9 +84,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: - name: cuda12.8-py3.10-gcc9-sm90 + name: build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -123,7 +127,7 @@ jobs: secrets: inherit test-periodically: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-periodically uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '15 0,12 * * 1-6' @@ -140,7 +144,7 @@ jobs: secrets: inherit test-weekly: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-weekly uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -157,13 +161,15 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc9-sm90 + name: test uses: ./.github/workflows/_linux-test.yml needs: build - if: github.event_name == 'workflow_dispatch' + # The pull_request trigger is used in PR to bump transformers pin which always + # needs one round of benchmark + if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }} with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 - dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} + dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }} docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index 0d92455a8f3c7..c3b9a42299247 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -48,6 +48,9 @@ jobs: { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml index 377f6d04bc8ce..f329fe74e6b64 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -85,26 +85,26 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-rocm-py3_10 - docker-image-name: 
ci-image:pytorch-linux-jammy-rocm-n-py3 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks test-matrix: | { include: [ - { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: 
"inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index 6e19130a19246..170de752ab875 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -69,14 +69,14 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, @@ -95,16 +95,16 @@ jobs: selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test-nightly: + name: inductor-test-nightly uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -112,17 +112,16 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit - - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 62234e5f499a7..f894b8fdc6e03 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -74,14 +74,14 
@@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -101,16 +101,16 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test-nightly-freezing: + name: inductor-test-nightly-freezing uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -118,16 +118,16 @@ jobs: monitor-data-collect-interval: 4 secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: build-environment: linux-jammy-py3.9-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 9fd81a5a05c9a..19f72ba453414 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -79,7 +79,6 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: name: cuda12.8-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index db6a235b8c864..21d965eaeaada 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -31,8 +31,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - 
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-build: + name: periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: @@ -57,63 +57,73 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-test: + name: periodic-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + needs: periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + rocm-periodic-dynamo-benchmarks-build: if: github.repository_owner == 'pytorch' - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-rocm-py3_10 - docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks sync-tag: rocm-build test-matrix: | { include: [ - { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamo_eager_timm", shard: 2, num_shards: 2, 
runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + rocm-periodic-dynamo-benchmarks-test: permissions: id-token: write contents: read - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-test uses: ./.github/workflows/_rocm-test.yml - needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + needs: rocm-periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-rocm-py3_10 - docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-build: + name: inductor-smoke-build uses: 
./.github/workflows/_linux-build.yml needs: - get-default-label-prefix @@ -129,23 +139,23 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-test: + name: inductor-smoke-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + needs: inductor-smoke-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-cpu-build: + name: periodic-dynamo-benchmarks-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -160,68 +170,6 @@ jobs: { config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, - ]} - build-additional-packages: "vision audio torchao" - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }} - secrets: inherit - - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - test-matrix: | - { include: [ - { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_huggingface", shard: 1, 
num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - build-additional-packages: "vision audio fbgemm torchao" - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build - test-matrix: | - { include: [ { config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, @@ -247,12 +195,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + periodic-dynamo-benchmarks-cpu-test: + name: periodic-dynamo-benchmarks-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: periodic-dynamo-benchmarks-cpu-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index f4c81ce7d7b8d..732ec7eb85f3e 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -47,8 +47,8 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index df918c329dd77..2125a8559363b 100644 --- a/.github/workflows/inductor-unittest.yml +++ 
b/.github/workflows/inductor-unittest.yml @@ -28,8 +28,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -47,44 +47,18 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_12-inductor-halide-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-build: + name: inductor-halide-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -97,18 +71,18 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-halide-test: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-test: + name: inductor-halide-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-halide-build + needs: inductor-halide-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu + inductor-triton-cpu-build: + name: inductor-triton-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -121,23 +95,23 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-test: + inductor-triton-cpu-test: name: 
linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build + needs: inductor-triton-cpu-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ @@ -148,37 +122,12 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-cpu-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: - name: cuda12.8-py3.13-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: - name: cuda12.8-py3.13-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 721572f1807ba..4189d24a7b14f 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -44,8 +44,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -53,7 +53,6 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks 
cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build test-matrix: | { include: [ { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, @@ -65,25 +64,24 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | { include: [ { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, @@ -98,12 +96,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-cpu-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 476195ab5eec7..b1a6dfb390711 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -93,7 +93,7 @@ jobs: script: | CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running mypy" - ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh lintrunner-noclang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -111,9 +111,9 @@ jobs: CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running all other linters" if [ "$CHANGED_FILES" = '*' ]; then - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh else - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY 
${CHANGED_FILES}" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh fi quick-checks: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 2acc987e523c4..65b8781be7585 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -42,8 +42,8 @@ jobs: needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 secrets: inherit docs-push: diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 16cb1600b8d6b..aaf32c160f0dc 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -24,38 +24,38 @@ permissions: contents: read jobs: - linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + opbenchmark-build: if: github.repository_owner == 'pytorch' - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + opbenchmark-on-demand-build: if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-on-demand-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + opbenchmark-test: + name: opbenchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + needs: opbenchmark-build with: build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} + docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 976fb241c99f9..714838eb84762 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -51,37 +51,6 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-cuda12_4-py3_10-gcc11-sm89-build: - name: linux-jammy-cuda12.4-py3.10-gcc11-sm89 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: 
linux-jammy-cuda12.4-py3.10-gcc11-sm89
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
-      cuda-arch-list: 8.9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
-    name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
-      docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
-    secrets: inherit
-
  linux-jammy-cuda12_4-py3_10-gcc11-build:
    name: linux-jammy-cuda12.4-py3.10-gcc11
    uses: ./.github/workflows/_linux-build.yml
@@ -201,6 +170,38 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
    secrets: inherit
+  linux-jammy-cuda13_0-py3_10-gcc11-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      cuda-arch-list: 7.5
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      test-matrix: |
+        { include: [
+          { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit
+
  linux-jammy-rocm-py3_10-build:
    name: linux-jammy-rocm-py3.10
    uses: ./.github/workflows/_linux-build.yml
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 519a1a870b16f..3f13fbf276882 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -49,14 +49,14 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} - linux-jammy-py3_9-gcc11-build: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-build: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -73,49 +73,49 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-test: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-test: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-gcc11-build + - linux-jammy-py3_10-gcc11-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit linux-docs: name: linux-docs uses: ./.github/workflows/_docs.yml - needs: linux-jammy-py3_9-gcc11-build + needs: linux-jammy-py3_10-gcc11-build with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} secrets: inherit - linux-jammy-py3_9-gcc11-no-ops: - name: linux-jammy-py3.9-gcc11-no-ops + linux-jammy-py3_10-gcc11-no-ops: + name: linux-jammy-py3.10-gcc11-no-ops uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-no-ops - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit - linux-jammy-py3_9-gcc11-pch: - name: linux-jammy-py3.9-gcc11-pch + linux-jammy-py3_10-gcc11-pch: + name: linux-jammy-py3.10-gcc11-pch uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-pch - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-pch + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -132,17 +132,17 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, ]} sync-tag: asan-build secrets: inherit - linux-jammy-py3_10-clang18-asan-test: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-test.yml @@ -156,13 +156,13 @@ jobs: sync-tag: asan-test secrets: inherit - linux-jammy-py3_9-clang12-onnx-build: - name: linux-jammy-py3.9-clang12-onnx + linux-jammy-py3_10-clang12-onnx-build: + name: linux-jammy-py3.10-clang12-onnx uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12-onnx + build-environment: linux-jammy-py3.10-clang12-onnx docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx test-matrix: | { include: [ @@ -171,26 +171,26 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-onnx-test: - name: linux-jammy-py3.9-clang12-onnx + linux-jammy-py3_10-clang12-onnx-test: + name: linux-jammy-py3.10-clang12-onnx uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-onnx-build + - linux-jammy-py3_10-clang12-onnx-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12-onnx - docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12-onnx + docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { 
config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -207,16 +207,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-py3_13-clang12-build: @@ -251,108 +251,22 @@ jobs: build-environment: linux-jammy-py3.13-clang12 docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} - timeout-minutes: 600 - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed: - name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - cuda-arch-list: '7.5' - test-matrix: | - { include: [ - { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, - { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, - { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-test-distributed: - name: linux-jammy-cuda12.8-py3.10-gcc11-test - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed - - target-determination - with: - timeout-minutes: 360 - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-build: - name: linux-jammy-cuda12.8-py3.10-gcc11 + linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.10-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { 
config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-test: - name: linux-jammy-cuda12.8-py3.10-gcc11 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-build - - target-determination - with: - timeout-minutes: 360 - build-environment: linux-jammy-cuda12.8-py3.10-gcc11 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: - name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit - linux-jammy-py3_9-clang9-xla-build: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang9-xla - docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3_9-clang9-xla-test: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_9-clang9-xla-build - with: - build-environment: linux-jammy-py3.9-clang9-xla - docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-cpu-py3_10-gcc11-bazel-test: name: linux-jammy-cpu-py3.10-gcc11-bazel-test uses: ./.github/workflows/_bazel-build-test.yml @@ -368,14 +282,14 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build: - name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build + linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build: + name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 build-generates-artifacts: false test-matrix: | { include: [ @@ -402,37 +316,6 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build: - name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" 
- build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 - cuda-arch-list: 8.9 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc11-sm89-test: - name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - uses: ./.github/workflows/_linux-test.yml - needs: - - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build - - target-determination - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-py3-clang12-executorch-build: if: false # Docker build needs pin update name: linux-jammy-py3-clang12-executorch @@ -484,15 +367,15 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_9-build: + name: linux-jammy-xpu-n-py3.9 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml new file mode 100644 index 0000000000000..e4ec656fafcc3 --- /dev/null +++ b/.github/workflows/riscv64.yml @@ -0,0 +1,24 @@ +name: riscv64 + +on: + push: + tags: + - ciflow/riscv64/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + pytorch-linux-noble-riscv64-py3_12-gcc14-cross-build: + if: github.repository_owner == 'pytorch' + name: pytorch-linux-noble-riscv64-py3_12-gcc14-cross-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-noble-riscv64-py3.12-gcc14 + docker-image-name: pytorch-linux-noble-riscv64-py3.12-gcc14 + runner: linux.2xlarge + secrets: inherit diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index c51d89e5c955d..7e3ba43bf9845 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -48,12 +48,12 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, 
num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 2a7b1d184330b..19b402f854572 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -78,14 +78,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -93,16 +93,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-rocm-py3_10-build: diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index 0d31948f196a1..5f0ad59d3a3bb 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -30,7 +30,7 @@ jobs: name: Test check_binary.sh for Linux CUDA uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: - runner: linux.4xlarge.nvidia.gpu + runner: linux.g4dn.4xlarge.nvidia.gpu docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index 7e4a818c3528d..1e83c7b9d98ce 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -4,6 +4,10 @@ on: pull_request: paths: - .github/workflows/test-h100.yml + - test/inductor/test_max_autotune.py + - 
torch/_inductor/kernel/mm.py + - torch/_inductor/kernel/mm_grouped.py + workflow_dispatch: schedule: - cron: 0 4,10,16,22 * * * # every 6 hours diff --git a/.github/workflows/tools-unit-tests.yml b/.github/workflows/tools-unit-tests.yml new file mode 100644 index 0000000000000..c687c07b7ca7e --- /dev/null +++ b/.github/workflows/tools-unit-tests.yml @@ -0,0 +1,70 @@ +name: test-scripts-and-ci-tools + +on: + push: + branches: + - main + paths: + - scripts/lumen_cli/** + - .github/workflows/tools-unit-tests.yml + pull_request: + paths: + - scripts/lumen_cli/** + - .github/workflows/tools-unit-tests.yml + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + lumen-cli-unit-tests-python312: + permissions: + contents: read + pull-requests: write + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-latest + steps: + - name: Checkout pytorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: true + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.12' + cache: pip + + - name: Run tests + continue-on-error: true + run: | + set -ex + python3 -m venv /tmp/venv + source /tmp/venv/bin/activate + pip install -e .ci/lumen_cli/ + pytest -v -s .ci/lumen_cli/tests/* + + lumen-cli-compatible-python39: + permissions: + contents: read + pull-requests: write + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-latest + steps: + - name: Checkout pytorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: true + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: '3.9' + cache: 'pip' + - name: Run tests + continue-on-error: true + run: | + set -ex + python3 -m venv /tmp/venv + source /tmp/venv/bin/activate + pip install -e .ci/lumen_cli/ diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index c656c16e97c2e..08fcd33402625 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -10,6 +10,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: + id-token: write + contents: read + jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 3879b62cc020e..4dd465d70803d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -63,6 +63,43 @@ jobs: ]} secrets: inherit + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '7.5 8.9' + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ 
needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" }, + { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + + # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build: name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops @@ -164,9 +201,9 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" }, + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" }, ]} secrets: inherit @@ -187,13 +224,12 @@ jobs: tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" secrets: inherit - # NB: Keep this in sync with inductor-perf-test-nightly.yml - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' secrets: inherit @@ -205,7 +241,7 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, diff --git 
a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 1fdb1da67a595..5c456c607c887 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -59,22 +59,19 @@ jobs: # on the PR appear in chronological order (timing issues can shuffle them around) sleep 60 fi + + # Require a comment id for merge operations + if [ -z "${COMMENT_ID}" ]; then + echo "Error: merge requires COMMENT_ID to be specified" + exit 1 + fi + if [ -n "${FORCE}" ]; then - if [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --force "${PR_NUM}" - fi + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" elif [ -n "${IGNORE_CURRENT}" ]; then - if [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}" - fi - elif [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" else - python3 .github/scripts/trymerge.py "${PR_NUM}" + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" fi - name: Comment on Canceled if: ${{ cancelled() && steps.checkout.outcome == 'success' }} diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 08ae920e7cb0d..7f0fe6058bd08 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -12,7 +12,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true -permissions: read-all +permissions: + id-token: write + contents: read jobs: # There must be at least one job here to satisfy GitHub action workflow syntax @@ -51,3 +53,27 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} + + linux-jammy-py3_9-clang9-xla-build: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.9-clang9-xla + docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite + test-matrix: | + { include: [ + { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3_9-clang9-xla-test: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_9-clang9-xla-build + with: + build-environment: linux-jammy-py3.9-clang9-xla + docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 3d445756f7a2e..aa12cf22b246c 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -23,7 +23,7 @@ jobs: with: repository: pytorch/pytorch stable-branch: viable/strict - requires: '[\"pull\", \"trunk\", 
\"lint\", \"linux-binary\", \"linux-aarch64\"]' + requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]' secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml new file mode 100644 index 0000000000000..654e88be386b6 --- /dev/null +++ b/.github/workflows/vllm.yml @@ -0,0 +1,76 @@ +name: vllm-test + +on: + push: + branches: + - main + - release/* + tags: + - ciflow/vllm/* + workflow_dispatch: + schedule: + - cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + torch-build: + name: ci-vllm-test + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-additional-packages: "vision audio" + build-external-packages: "vllm" + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm + cuda-arch-list: '8.0;8.9;9.0' + runner: linux.24xlarge.memory + test-matrix: | + { include: [ + { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_test", shard: 2, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { 
config: "vllm_lora_test", shard: 3, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"}, + { config: "vllm_distributed_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"} + ]} + secrets: inherit + + vllm-test-sm89: + name: ci-vllm-test + uses: ./.github/workflows/_linux-test.yml + needs: [ + torch-build, + ] + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc11 + docker-image: ${{ needs.torch-build.outputs.docker-image }} + test-matrix: ${{ needs.torch-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/win-arm64-build-test.yml b/.github/workflows/win-arm64-build-test.yml index 627a43b56bf70..95b4e2f027f60 100644 --- a/.github/workflows/win-arm64-build-test.yml +++ b/.github/workflows/win-arm64-build-test.yml @@ -4,6 +4,9 @@ on: push: tags: - ciflow/win-arm64/* + schedule: + # Every 4 hours starting at 00:00 UTC + - cron: '0 */4 * * *' env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index c62918b4af210..36ba62349f28b 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -26,15 +26,15 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-xpu-2025_0-py3_9-build: - name: linux-jammy-xpu-2025.0-py3.9 + linux-jammy-xpu-n-1-py3_10-build: + name: linux-jammy-xpu-n-1-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-0-build + sync-tag: linux-xpu-n-1-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.0-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + build-environment: linux-jammy-xpu-n-1-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 runner: linux.12xlarge test-matrix: | { include: [ @@ -47,60 +47,62 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-build: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 runner: linux.12xlarge test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, 
num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" }, ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-test: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-test: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml - needs: linux-jammy-xpu-2025_1-py3_9-build + needs: linux-jammy-xpu-n-py3_10-build permissions: id-token: write contents: read with: - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} + build-environment: linux-jammy-xpu-n-py3.10 + docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} secrets: inherit - windows-xpu-2025_0-build: + windows-xpu-n-1-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_0-py3 + name: win-vs2022-xpu-n-1-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-1-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.0' + xpu-version: '2025.1' vc-year: '2022' secrets: inherit - windows-xpu-2025_1-build: + windows-xpu-n-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_1-py3 + name: win-vs2022-xpu-n-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.1' + xpu-version: '2025.2' vc-year: '2022' secrets: inherit diff --git a/.gitignore b/.gitignore index b4e78e642b245..d1fa4cd3caf28 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ coverage.xml aten/build/ aten/src/ATen/Config.h aten/src/ATen/cuda/CUDAConfig.h +aten/src/ATen/hip/HIPConfig.h benchmarks/.data caffe2/cpp_test/ dist/ @@ -146,6 +147,9 @@ merge_record.json torchgen/packaged/* !torchgen/packaged/README.md +# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py. 
+torch/_rocm_init.py + # IPython notebook checkpoints .ipynb_checkpoints diff --git a/.lintrunner.toml b/.lintrunner.toml index 9c46c91b5e353..944829fa38977 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -132,7 +132,7 @@ include_patterns = [ 'test/test_complex.py', 'test/test_datapipe.py', 'test/test_futures.py', - # 'test/test_numpy_interop.py', + 'test/test_numpy_interop.py', 'test/test_torch.py', 'test/test_type_hints.py', 'test/test_type_info.py', @@ -583,7 +583,7 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=#include ', '--linter-name=PYBIND11_INCLUDE', '--match-first-only', @@ -1452,11 +1452,9 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - '--no-black-binary', - 'black==23.12.1', 'usort==1.0.8.post1', 'isort==6.0.1', - 'ruff==0.12.2', # sync with RUFF + 'ruff==0.12.9', # sync with RUFF ] is_formatter = true @@ -1591,7 +1589,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.12.2', # sync with PYFMT + 'ruff==0.12.9', # sync with PYFMT ] is_formatter = true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 2c67fb1981b71..0000000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -repos: - - repo: local - hooks: - - id: lintrunner - name: Run Lintrunner in an isolated venv before every push. The first run may be slow... - entry: python scripts/run_lintrunner.py # wrapper below - language: python # pre‑commit manages venv for the wrapper - additional_dependencies: [] # wrapper handles lintrunner install - always_run: true - stages: [pre-push] # fire only on pre‑push - pass_filenames: false # Lintrunner gets no per‑file args - verbose: true # stream output as it is produced...allegedly anyways diff --git a/AGENTS.md b/AGENTS.md index daf0f491702ba..3d5436a02a85d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1 +1,17 @@ - This is the only AGENTS.md, there are no recursive AGENTS.md +- When you are working on a bug, first create a standalone file that + reproduces the bug and verify it fails in the expected way. Use this to + test if your changes work. Once the change is passing, find an appropriate + test file to add the test to and make sure to follow local conventions on + the test file. +- If you are running the real test suite, DO NOT run the entire test suite. 
+ Instead run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir' +- Do NOT run setup.py, you do not have a working build environment +- Do NOT run pre-commit, it is not setup +- To run lint, run 'lintrunner -a' (which will autoapply changes) +- Do NOT attempt to install dependencies, you do not have Internet access +- When you are ready to make a PR, do exactly these steps: + - git stash -u + - git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch + - git stash pop + - Resolve conflicts if necessary diff --git a/BUILD.bazel b/BUILD.bazel index 50ffa12576475..2cbd36f06761b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -22,7 +22,6 @@ COMMON_COPTS = [ "-DHAVE_SHM_UNLINK=1", "-D_FILE_OFFSET_BITS=64", "-DUSE_FBGEMM", - "-DUSE_DISTRIBUTED", "-DAT_PER_OPERATOR_HEADERS", "-DATEN_THREADING=NATIVE", "-DNO_CUDNN_DESTROY_HANDLE", @@ -279,6 +278,7 @@ header_template_rule( "@AT_BLAS_F2C@": "0", "@AT_BLAS_USE_CBLAS_DOT@": "1", "@AT_KLEIDIAI_ENABLED@": "0", + "@AT_USE_EIGEN_SPARSE@": "0", }, ) @@ -746,6 +746,7 @@ cc_library( "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], )) + torch_sources, diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000000..dcdf409e73146 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,15 @@ +# Testing + +Use our test class and test runner: + +``` +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestFeature(TestCase): + ... + +if __name__ == "__main__": + run_tests() +``` + +To test Tensor equality, use assertEqual. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d3314c72814a..21c867dd6b6e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,8 +181,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)") set(CPU_POWER ON) endif() -# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not -# tested and likely won't work without additional changes. +# For non-supported platforms, turn USE_DISTRIBUTED off by default. 
+# NB: USE_DISTRIBUTED simply disables the backend; distributed code +# still gets built if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF @@ -233,13 +234,16 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on" option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF) option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) +option(USE_LSAN "Use Leak Sanitizer" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_XPU "Use XPU" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) -cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) +cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX OR WIN32" OFF) +cmake_dependent_option(USE_ROCM_CK_GEMM "Use ROCm Composable Kernel for GEMMs" ON "USE_ROCM;NOT WIN32" OFF) +option(USE_ROCM_CK_SDPA "Use ROCm Composable Kernel for SDPA" OFF) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF @@ -251,7 +255,6 @@ cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF) option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) -option(USE_FAKELOWP "Use FakeLowp operators" OFF) option(USE_GFLAGS "Use GFLAGS" OFF) option(USE_GLOG "Use GLOG" OFF) option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF) @@ -260,16 +263,18 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) +option(USE_DISTRIBUTED "Enable default distributed backends" ON) cmake_dependent_option(USE_NCCL "Use NCCL" ON - "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_XCCL "Use XCCL" ON - "USE_XPU;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON - "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX" @@ -286,6 +291,7 @@ option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." 
option(USE_PROF "Use profiling" OFF) option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON) option(USE_SNPE "Use Qualcomm's SNPE library" OFF) +option(USE_EIGEN_SPARSE "Use Eigen Sparse Matrices" OFF) option(USE_SYSTEM_EIGEN_INSTALL "Use system Eigen instead of the one under third_party" OFF) cmake_dependent_option( @@ -322,7 +328,6 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN}) cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN" OFF) option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF) -option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) @@ -427,11 +432,10 @@ if(WIN32) PATH_SUFFIXES lib NO_DEFAULT_PATH) if(NOT libuv_tmp_LIBRARY) - set(USE_DISTRIBUTED OFF) set(USE_GLOO OFF) message( WARNING - "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " + "Libuv is not installed in current conda env. Set USE_GLOO to OFF. " "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv." ) else() @@ -834,10 +838,11 @@ include(ExternalProject) # ---[ Dependencies ---[ FBGEMM doesn't work on x86 32bit and # CMAKE_SYSTEM_PROCESSOR thinks its 64bit -if(USE_FBGEMM - AND((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VOID_P EQUAL - 4) - OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86")) +if(USE_FBGEMM AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + message(WARNING + "x64 operating system is required for FBGEMM. " + "Not compiling with FBGEMM. " + "Turn this warning off by USE_FBGEMM=OFF.") set(USE_FBGEMM OFF) endif() @@ -1193,7 +1198,7 @@ if(APPLE) string( APPEND CMAKE_SHARED_LINKER_FLAGS - " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal" + " -weak_framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal -weak_framework IOKit" ) # To suppress MPSGraph availability warnings append_cxx_flag_if_supported("-Wno-unguarded-availability-new" diff --git a/CODEOWNERS b/CODEOWNERS index 24ab4fd35be9d..1d91adacb0629 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -164,6 +164,7 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd # torch.export /torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi +/torch/_export/serde/schema.py @SherlockNoMad @zhxchen17 # Dynamic Shapes /torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dade8f4ec6ec0..9d2b5d3553910 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,13 +88,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. -* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use +* When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use the current local source-tree when importing `torch` package. (This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) This way you do not need to repeatedly install after modifying Python files (`.py`). 
However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). - One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, + One way to avoid running `python -m pip install -e . -v --no-build-isolation` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following: ```bash pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd @@ -116,7 +116,7 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows Next run `python setup.py clean`. After that, you can install in editable mode again. -* If you run into errors when running `python -m pip install -e .`, here are some debugging steps: +* If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps: 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many @@ -129,10 +129,10 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows git clean -xdf python setup.py clean git submodule update --init --recursive - python -m pip install -r requirements.txt + python -m pip install --group dev python -m pip install --no-build-isolation -v -e . ``` - 4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to + 4. The main step within `python -m pip install -e . -v --no-build-isolation` is running `make` from the `build` directory. If you want to experiment with some environment variables, you can pass them into the command: ```bash ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e . @@ -259,6 +259,7 @@ dependencies as well as the nightly binaries into the repo directory. support for PyTorch. * [tools](tools) - Code generation scripts for the PyTorch library. See [README](tools/README.md) of this directory for more details. +* [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml * [test](test) - Python unit tests for PyTorch Python frontend. * [test_torch.py](test/test_torch.py) - Basic tests for PyTorch functionality. @@ -294,7 +295,7 @@ The following packages should be installed with `pip`: - `pytest` - recommended to run tests more selectively Running ``` -pip install -r requirements.txt +pip install --group dev ``` will install these dependencies for you. @@ -645,9 +646,9 @@ can be selected interactively with your mouse to zoom in on a particular part of the program execution timeline. The `--native` command-line option tells `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers for C++ code it may be necessary to compile PyTorch in debug mode by prepending -your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`. -Depending on your operating system it may also be necessary to run `py-spy` with -root privileges. +your `python -m pip install -e . -v --no-build-isolation` call to compile +PyTorch with `DEBUG=1`. 
Depending on your operating system it may also be +necessary to run `py-spy` with root privileges. `py-spy` can also work in an `htop`-like "live profiling" mode and can be tweaked to adjust the stack sampling rate, see the `py-spy` readme for more @@ -655,10 +656,10 @@ details. ## Managing multiple build trees -One downside to using `python -m pip install -e .` is that your development -version of PyTorch will be installed globally on your account (e.g., if -you run `import torch` anywhere else, the development version will be -used). +One downside to using `python -m pip install -e . -v --no-build-isolation` is +that your development version of PyTorch will be installed globally on your +account (e.g., if you run `import torch` anywhere else, the development version +will be used). If you want to manage multiple builds of PyTorch, you can make use of [venv environments](https://docs.python.org/3/library/venv.html) to maintain @@ -719,7 +720,7 @@ options. ### Code completion and IDE support -When using `python -m pip install -e .`, PyTorch will generate +When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate a `compile_commands.json` file that can be used by many editors to provide command completion and error highlighting for PyTorch's C++ code. You need to `pip install ninja` to generate accurate diff --git a/README.md b/README.md index 65c0bb982bd96..99e6dabd16181 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png) +![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png) -------------------------------------------------------------------------------- @@ -72,7 +72,7 @@ Elaborating Further: If you use NumPy, then you have used Tensors (a.k.a. ndarray). -![Tensor illustration](./docs/source/_static/img/tensor_illustration.png) +![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png) PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the computation by a huge amount. @@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date. You get the best of speed and flexibility for your crazy research. 
-![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif) +![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif) ### Python First @@ -242,9 +242,8 @@ git submodule update --init --recursive **Common** ```bash -conda install cmake ninja -# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below -pip install -r requirements.txt +# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above +pip install --group dev ``` **On Linux** @@ -395,7 +394,7 @@ On macOS ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" -MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build ccmake build # or cmake-gui build ``` @@ -560,7 +559,7 @@ To learn more about making a contribution to Pytorch, please see our [Contributi PyTorch is a community-driven project with several skillful engineers and researchers contributing to it. -PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. +PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), [Alban Desmaison](https://github.com/albanD), [Piotr Bialecki](https://github.com/ptrblck) and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch. 
diff --git a/android/README.md b/android/README.md index 6b8000c13fccc..f0c74750522de 100644 --- a/android/README.md +++ b/android/README.md @@ -2,7 +2,7 @@ ## Demo applications and tutorials -Please refer to [pytorch-labs/executorch-examples](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). +Please refer to [meta-pytorch/executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). Please join our [Discord](https://discord.com/channels/1334270993966825602/1349854760299270284) for any questions. diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index b02638e5b6de7..a3c98f37a0242 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -96,6 +96,8 @@ file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") file(GLOB vulkan_cpp "vulkan/*.cpp") file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp") +file(GLOB native_eigen_cpp "native/sparse/eigen/*.cpp") + # Metal file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") @@ -119,6 +121,8 @@ file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp") file(GLOB_RECURSE native_mps_mm "native/mps/*.mm") file(GLOB_RECURSE native_mps_metal "native/mps/*.metal") file(GLOB_RECURSE native_mps_h "native/mps/*.h") +file(GLOB_RECURSE native_sparse_mps_mm "native/sparse/mps/*.mm") +file(GLOB_RECURSE native_mps_sparse_metal "native/sparse/mps/*.metal") file(GLOB native_sparse_cpp "native/sparse/*.cpp") file(GLOB native_quantized_cpp @@ -178,26 +182,27 @@ file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_a file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip") # if USE_FLASH_ATTENTION is set, ensure CK instances get generated if(USE_FLASH_ATTENTION) - if(DEFINED ENV{USE_CK_FLASH_ATTENTION}) - set(USE_CK_FLASH_ATTENTION $ENV{USE_CK_FLASH_ATTENTION}) - if(USE_CK_FLASH_ATTENTION STREQUAL "1") - if(DEFINED ENV{PYTORCH_ROCM_ARCH}) - list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS) - if(NUM_ARCHS GREATER 1) - message(WARNING "Building CK for multiple archs can increase build time considerably! - Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for") - endif() - endif() - message(STATUS "USE_CK_FLASH_ATTENTION is set; building PyTorch with CK Flash Attention enabled") - message(STATUS "Generating CK kernel instances...") - add_subdirectory(native/transformers/hip/flash_attn/ck) - file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip") - list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip}) - # FAv3 Generation - add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3) - file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip") - list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip}) + if("$ENV{USE_CK_FLASH_ATTENTION}" STREQUAL "1") + message(STATUS "USE_CK_FLASH_ATTENTION is being deprecated. Please use USE_ROCM_CK_SDPA instead") + caffe2_update_option(USE_ROCM_CK_SDPA ON) + endif() + if(USE_ROCM_CK_SDPA) + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + list(LENGTH PYTORCH_ROCM_ARCH NUM_ARCHS) + if(NUM_ARCHS GREATER 1) + message(WARNING "Building CK for multiple archs can increase build time considerably! 
+ Consider setting PYTORCH_ROCM_ARCH env var value as the gfx arch you need to build for") endif() + endif() + message(STATUS "USE_ROCM_CK_SDPA is set; building PyTorch with CK SDPA enabled") + message(STATUS "Generating CK kernel instances...") + add_subdirectory(native/transformers/hip/flash_attn/ck) + file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip") + list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip}) + # FAv3 Generation + add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3) + file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip") + list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip}) endif() file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip") file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip") @@ -211,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) - target_include_directories(flash_attention PUBLIC + target_include_directories(flash_attention SYSTEM PUBLIC ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -255,39 +260,78 @@ endif() # FBGEMM GenAI IF(USE_FBGEMM_GENAI) set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/) - set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) - - if(USE_ROCM) - # Only include the kernels we want to build to avoid increasing binary size. - file(GLOB_RECURSE fbgemm_genai_native_rocm_hip - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") - set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add additional HIPCC compiler flags for performance - set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS - -mllvm - -amdgpu-coerce-illegal-types=1 - -mllvm - -enable-post-misched=0 - -mllvm - -greedy-reverse-local-assignment=1 - -fhip-new-launch-api) + set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) + if(USE_CUDA) + # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. + # If you want to integrate a kernel from FBGEMM into torch, you have to add it here. 
+ set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*") + file(GLOB_RECURSE fbgemm_genai_native_cuda_cu + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu" + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") + list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX}) + + file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp + "${FBGEMM_GENAI_SRCS}/common/*.cpp" + ) + + # Combine all source files into a single list + list(APPEND fbgemm_genai_all_sources + ${fbgemm_genai_native_cuda_cu} + ${fbgemm_genai_native_cuda_cpp} + ) hip_add_library( fbgemm_genai STATIC ${fbgemm_genai_native_rocm_hip} HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + set(fbgemm_genai_mx8mx8bf16_grouped + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" + ) target_include_directories(fbgemm_genai PUBLIC - # FBGEMM version of Composable Kernel is used due to some customizations - ${FBGEMM_THIRD_PARTY}/composable_kernel/include - ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include - ${FBGEMM_GENAI_DIR}/include/ - ${FBGEMM_GENAI_DIR}/common/include/ + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${fbgemm_genai_mx8mx8bf16_grouped} + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h ) + else() + if(USE_ROCM) + # Only include the kernels we want to build to avoid increasing binary size. + file(GLOB_RECURSE fbgemm_genai_native_rocm_hip + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") + set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # Add additional HIPCC compiler flags for performance + set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS + -mllvm + -amdgpu-coerce-illegal-types=1 + -mllvm + -enable-post-misched=0 + -mllvm + -greedy-reverse-local-assignment=1 + -fhip-new-launch-api) + + hip_add_library( + fbgemm_genai STATIC + ${fbgemm_genai_native_rocm_hip} + HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + target_include_directories(fbgemm_genai PUBLIC + # FBGEMM version of Composable Kernel is used due to some customizations + ${FBGEMM_THIRD_PARTY}/composable_kernel/include + ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h + ) + endif() endif() endif() @@ -338,6 +382,9 @@ if(USE_VULKAN) else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +if(USE_EIGEN_SPARSE) + set(all_cpu_cpp ${all_cpu_cpp} ${native_eigen_cpp}) +endif() if(USE_MTIA) set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h}) @@ -416,40 +463,42 @@ if(USE_CUDA) endif() if(USE_ROCM) - # NOTE: The PyTorch build does not actually add_subdirectory - # third_party/composable_kernel or use it as a CMake 
library. What is used - # is header only, so this should be ok, except that the CMake build generates - # a ck/config.h. We just do that part here. Without this, the ck.h from the - # ROCM SDK may get accidentally used instead. - function(_pytorch_rocm_generate_ck_conf) - set(CK_ENABLE_INT8 "ON") - set(CK_ENABLE_FP16 "ON") - set(CK_ENABLE_FP32 "ON") - set(CK_ENABLE_FP64 "ON") - set(CK_ENABLE_BF16 "ON") - set(CK_ENABLE_FP8 "ON") - set(CK_ENABLE_BF8 "ON") - set(CK_USE_XDL "ON") - set(CK_USE_WMMA "ON") - configure_file( - "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" - "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" - ) - endfunction() - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include) - _pytorch_rocm_generate_ck_conf() + if((USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) OR USE_ROCM_CK_GEMM) + # NOTE: The PyTorch build does not actually add_subdirectory + # third_party/composable_kernel or use it as a CMake library. What is used + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. + function(_pytorch_rocm_generate_ck_conf) + set(CK_ENABLE_INT8 "ON") + set(CK_ENABLE_FP16 "ON") + set(CK_ENABLE_FP32 "ON") + set(CK_ENABLE_FP64 "ON") + set(CK_ENABLE_BF16 "ON") + set(CK_ENABLE_FP8 "ON") + set(CK_ENABLE_BF8 "ON") + set(CK_USE_XDL "ON") + set(CK_USE_WMMA "ON") + configure_file( + "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" + ) + endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include) + _pytorch_rocm_generate_ck_conf() + endif() # Next two lines are needed because TunableOp uses third-party/fmt list(APPEND ATen_HIP_INCLUDE $) list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only) -if(USE_FLASH_ATTENTION) - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) -endif() + if(USE_FLASH_ATTENTION AND USE_ROCM_CK_SDPA) + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/transformers/hip/flash_attn/ck) + endif() list(APPEND ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} @@ -459,12 +508,13 @@ endif() ${native_quantized_hip_hip} ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} ) - if(WIN32) # Windows doesn't support Composable Kernels + if(NOT USE_ROCM_CK_GEMM) file(GLOB native_hip_bgemm 
"native/hip/bgemm_kernels/*.hip") file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" ${native_hip_bgemm} ${native_hip_ck}) endif() + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) list(APPEND all_hip_cpp ${native_nested_hip_cpp} @@ -624,12 +674,26 @@ if(USE_CUDA AND NOT USE_ROCM) add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) + + # Add FBGEMM_GENAI include directories for torch_ops.h + if(USE_FBGEMM_GENAI) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) + endif() + if($ENV{ATEN_STATIC_CUDA}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS - ${CUDA_LIBRARIES} - CUDA::cusparse_static - CUDA::cufft_static_nocallback - ) + if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static_nocallback) + else() + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static) + endif() + if(NOT BUILD_LAZY_CUDA_LINALG) list(APPEND ATen_CUDA_DEPENDENCY_LIBS CUDA::cusolver_static @@ -699,10 +763,10 @@ endif() if(USE_MPS) include(../../../cmake/Metal.cmake) - set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h}) + set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h} ${native_sparse_mps_mm}) if(CAN_COMPILE_METAL) - foreach(SHADER ${native_mps_metal}) + foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal}) cmake_path(GET SHADER STEM TGT_STEM) string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air") list(APPEND AIR_BASIC ${TGT_BASIC}) @@ -717,7 +781,7 @@ if(USE_MPS) add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp) else() file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps") - foreach(SHADER ${native_mps_metal}) + foreach(SHADER ${native_mps_metal} ${native_mps_sparse_metal}) cmake_path(GET SHADER STEM TGT_STEM) string(CONCAT SHADER_HDR_NAME "${CMAKE_CURRENT_BINARY_DIR}" /native/mps/ ${TGT_STEM} "_metallib.h") metal_to_metallib_h(${SHADER} ${SHADER_HDR_NAME}) diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in index c22e15a52aa23..0bae6d4af6e5e 100644 --- a/aten/src/ATen/Config.h.in +++ b/aten/src/ATen/Config.h.in @@ -20,3 +20,4 @@ #define AT_BLAS_F2C() @AT_BLAS_F2C@ #define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@ #define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@ +#define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@ diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 03529c64d6cac..4d48084b0ab89 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -480,6 +480,9 @@ at::BlasBackend Context::blasPreferredBackend() { // call site for blasPreferredBackend(), we set it to an actual value. 
if (blas_preferred_backend == at::BlasBackend::Default) { blas_preferred_backend = at::BlasBackend::Cublas; + // This logic sits in the getter because it needs to validate + // values set via env vars such as TORCH_BLAS_PREFER_CUBLASLT + // which initialize the backend without calling the setter #ifdef USE_ROCM // AMD Instinct targets prefer hipblaslt static const bool hipblaslt_preferred = []() { @@ -509,6 +512,10 @@ at::BlasBackend Context::blasPreferredBackend() { // hipblaslt support for all archs is not as complete as hipblas if (blas_preferred_backend == at::BlasBackend::Cublaslt) { static const bool hipblaslt_unsupported = []() { + if(!hasCuBLASLt()) + { + return true; + } static const std::vector archs = { "gfx90a", "gfx942", #if ROCM_VERSION >= 60300 @@ -534,6 +541,24 @@ at::BlasBackend Context::blasPreferredBackend() { return blas_preferred_backend; } +bool Context::ckSupported() { +#ifdef USE_ROCM + static const std::vector supported_archs = { + "gfx90a", "gfx942", "gfx950" + }; + for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) { + if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) { + TORCH_WARN_ONCE( + "Attempting to use CK on an unsupported architecture! Cannot set backend to CK"); + return false; + } + } + return true; +#else + return false; +#endif +} + void Context::setBlasPreferredBackend(at::BlasBackend b) { #ifdef _MSC_VER TORCH_WARN_ONCE( @@ -543,8 +568,14 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #else TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(), "Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt."); - TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(), - "Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm."); +#ifdef USE_ROCM + static const bool ckSupportedFlag = ckSupported(); + static const bool hasCKGEMMFlag = hasCKGEMM(); + TORCH_CHECK((b != at::BlasBackend::Ck) || (ckSupportedFlag && hasCKGEMMFlag), + "Cannot set preferred blas backend to CK since following conditions are not true: ", + "architecture supported for CK: ", ckSupportedFlag, + ", PyTorch built with CK GEMM support: ", hasCKGEMMFlag); +#endif if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) { TORCH_WARN_ONCE( "torch.backends.cuda.preferred_blas_library is an experimental feature. " @@ -556,35 +587,40 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) { #endif } -at::ROCmFABackend Context::getROCmFAPreferredBackend() const { +at::ROCmFABackend Context::getROCmFAPreferredBackend() { +#ifdef USE_ROCM + // Set potential "Default" value so we don't have to interpret at call sites. + // We use aotriton backend as the default, for now. 
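[Editorial aside, not part of the patch: a minimal sketch of how a caller could exercise the CK validation path added in this hunk. It uses only symbols visible above (`at::globalContext()`, `setBlasPreferredBackend`, `at::BlasBackend::Ck`, `blasPreferredBackend`); the ROCm build, the `main` wrapper, and the printed output are assumptions for illustration.]

```cpp
// Illustrative sketch only; assumes a ROCm build of PyTorch.
#include <ATen/Context.h>
#include <c10/util/Exception.h>
#include <iostream>

int main() {
  at::Context& ctx = at::globalContext();

  // With this patch, requesting CK requires both a supported gfx arch
  // (ckSupported) and a build with CK GEMM enabled (hasCKGEMM); otherwise
  // the setter rejects the request via TORCH_CHECK with the failing
  // conditions spelled out in the error message.
  try {
    ctx.setBlasPreferredBackend(at::BlasBackend::Ck);
  } catch (const c10::Error& e) {
    std::cerr << "CK backend rejected: " << e.what() << "\n";
  }

  // The getter resolves Default to a concrete backend and re-validates
  // values that were initialized from env vars such as
  // TORCH_BLAS_PREFER_CUBLASLT, as the comment above explains.
  at::BlasBackend active = ctx.blasPreferredBackend();
  std::cout << "active BLAS backend: " << static_cast<int>(active) << "\n";
  return 0;
}
```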
+ if(rocm_fa_preferred_backend == at::ROCmFABackend::Default) { + rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton; + } else if (rocm_fa_preferred_backend == at::ROCmFABackend::Ck) { + // This logic sits in the getter because it needs to validate + // values set via env vars such as TORCH_ROCM_FA_PREFER_CK + // which initialize the backend without calling the setter + // Perform validity checking + static const bool hasCKSDPAFlag = hasCKSDPA(); + static const bool ckSupportedFlag = ckSupported(); + if(!(hasCKSDPAFlag && ckSupportedFlag)){ + TORCH_WARN_ONCE( + "Cannot set preferred SDPA backend to CK since following conditions are not true: ", + "architecture supported for CK: ", ckSupportedFlag, + ", PyTorch built with CK SDPA support: ", hasCKSDPAFlag); + rocm_fa_preferred_backend = at::ROCmFABackend::AOTriton; + } + } +#endif + return rocm_fa_preferred_backend; } void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { - - // TODO: add plumbing for hasCK for validity checking - TORCH_CHECK((b != at::ROCmFABackend::Ck) || hasROCM(), - "Cannot set preferred flash attention backend to Ck if PyTorch has not been compiled for ROCm."); #ifdef USE_ROCM - if(b == at::ROCmFABackend::Ck) { - static const bool ck_unsupported = []() { - static const std::vector archs = { - "gfx90a", "gfx942", "gfx950" - }; - for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { - if (!detail::getCUDAHooks().isGPUArch(archs, index)) { - TORCH_WARN_ONCE( - "Attempting to use CK on an unsupported architecture! Cannot set backend to CK"); - return true; - } - } - return false; - }(); - if(!ck_unsupported) rocm_fa_preferred_backend = b; - } - else { - rocm_fa_preferred_backend = b; - } + static const bool hasCKSDPAFlag = hasCKSDPA(); + static const bool ckSupportedFlag = ckSupported(); + TORCH_CHECK((b != at::ROCmFABackend::Ck) || (hasCKSDPAFlag && ckSupportedFlag), + "Cannot set preferred SDPA backend to CK since following conditions are not true: ", + "architecture supported for CK: ", ckSupportedFlag, + ", PyTorch built with CK SDPA support: ", hasCKSDPAFlag); #endif rocm_fa_preferred_backend = b; } @@ -662,6 +698,14 @@ bool Context::hasLAPACK() { #endif } +bool Context::hasEigenSparse() { +#if AT_USE_EIGEN_SPARSE() + return true; +#else + return false; +#endif +} + at::QEngine Context::qEngine() const { static auto _quantized_engine = []() { at::QEngine qengine = at::kNoQEngine; diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 945076f3f0124..5cfa9b23e20aa 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -132,6 +132,8 @@ class TORCH_API Context { static bool hasKleidiAI(); static bool hasLAPACK(); static bool hasMKLDNN(); + static bool ckSupported(); + static bool hasEigenSparse(); static bool hasMAGMA() { return detail::getCUDAHooks().hasMAGMA(); } @@ -162,6 +164,12 @@ class TORCH_API Context { static bool hasROCM() { return detail::getCUDAHooks().hasROCM(); } + static bool hasCKSDPA() { + return detail::getCUDAHooks().hasCKSDPA(); + } + static bool hasCKGEMM() { + return detail::getCUDAHooks().hasCKGEMM(); + } static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } @@ -252,7 +260,7 @@ class TORCH_API Context { at::BlasBackend blasPreferredBackend(); void setBlasPreferredBackend(at::BlasBackend); - at::ROCmFABackend getROCmFAPreferredBackend() const; + at::ROCmFABackend getROCmFAPreferredBackend(); void setROCmFAPreferredBackend(at::ROCmFABackend); // Note [Enabling Deterministic Operations] @@ -608,6 +616,10 @@ inline bool hasLAPACK() { 
return globalContext().hasLAPACK(); } +inline bool hasEigenSparse() { + return globalContext().hasEigenSparse(); +} + inline bool hasMAGMA() { return globalContext().hasMAGMA(); } diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 9632cd5ed6983..98ad757946bec 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -308,17 +308,44 @@ void fillVersion( // constructed out of ATen tensor template T* toDLPackImpl(const Tensor& src) { - // create a new tensor with possibly normalized strides - // gh-83069 - auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; + auto view = src; + + // Detect whether there is need to normalize the strides + // Background: gh-83069 + // + // However, normalizing strides can come at a high-cost + // to slow down toDLPack conversion 3x, so we + // only normalize if needed. + // + // The following code detects whether the src follows + // a continuous pattern. If the src follows such pattern (common-case) + // then we do not need to normalize the strides. + bool need_normalize_strides = false; + int64_t expected_stride = 1; + for (int i = src.dim() - 1; i >= 0; i--) { + // detect if we do not meet continuous pattern + // and the size is 1, so there is opportunity to normalize + if (src.stride(i) != expected_stride && src.size(i) == 1) { + need_normalize_strides = true; + break; + } + expected_stride *= src.size(i); + } + + // less common case, try normalizing the strides + if (need_normalize_strides) { + // create a new tensor with possibly normalized strides + // gh-83069 + auto shape = src.sizes(); + auto strides = src.strides().vec(); + for (int i = 0; i < src.dim(); i++) { + if (shape[i] < 2) { + strides[i] = 1; + } } + view = src.as_strided(shape, strides, src.storage_offset()); } - auto view = src.as_strided(shape, strides, src.storage_offset()); ATenDLMTensor* atDLMTensor(new ATenDLMTensor); atDLMTensor->handle = view; atDLMTensor->tensor.manager_ctx = atDLMTensor; diff --git a/aten/src/ATen/DTensorState.cpp b/aten/src/ATen/DTensorState.cpp new file mode 100644 index 0000000000000..0644aae3d0709 --- /dev/null +++ b/aten/src/ATen/DTensorState.cpp @@ -0,0 +1,17 @@ +#include + +namespace at { + +namespace { +thread_local bool kDTensorAllowImplicitReplication = false; +} + +bool get_dtensor_allow_implicit_replication() { + return kDTensorAllowImplicitReplication; +} + +void set_dtensor_allow_implicit_replication(bool enabled) { + kDTensorAllowImplicitReplication = enabled; +} + +} // namespace at diff --git a/aten/src/ATen/DTensorState.h b/aten/src/ATen/DTensorState.h new file mode 100644 index 0000000000000..07e89eaeddae7 --- /dev/null +++ b/aten/src/ATen/DTensorState.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace at { + +TORCH_API bool get_dtensor_allow_implicit_replication(); +TORCH_API void set_dtensor_allow_implicit_replication(bool enabled); + +struct DTensorAllowImplicitReplication { + DTensorAllowImplicitReplication() + : prev_dtensor_allow_implicit_replication_( + get_dtensor_allow_implicit_replication()) { + set_dtensor_allow_implicit_replication(true); + } + + DTensorAllowImplicitReplication(const DTensorAllowImplicitReplication&) = + delete; + DTensorAllowImplicitReplication& operator=( + const DTensorAllowImplicitReplication&) = delete; + DTensorAllowImplicitReplication(DTensorAllowImplicitReplication&&) = delete; + DTensorAllowImplicitReplication& operator=( + DTensorAllowImplicitReplication&&) 
= delete; + + ~DTensorAllowImplicitReplication() { + set_dtensor_allow_implicit_replication( + prev_dtensor_allow_implicit_replication_); + } + + private: + bool prev_dtensor_allow_implicit_replication_; +}; + +} // namespace at diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index f37e492c861fe..f23b35047fcc8 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index); // original device index that was active before the change. TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index); +TORCH_API inline void emptyCache() { + const auto device_type = getAccelerator(true).value(); + at::getDeviceAllocator(device_type)->emptyCache(); +} + +TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + return at::getDeviceAllocator(device_type)->getDeviceStats(device_index); +} + +TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index); +} + +TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + at::getDeviceAllocator(device_type)->resetPeakStats(device_index); +} + } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 5634733325a2e..0e535ab20cd21 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -31,7 +31,9 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { return at::globalContext().getPinnedMemoryAllocator(opt_device_type); } else { TORCH_CHECK( - false, "Need to provide pin_memory allocator to use pin memory.") + false, + "pin_memory=True requires a CUDA or other accelerator backend; " + "no pinned memory allocator is available on this system.") } } diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 33977d8d7cf8a..22509c7be4e19 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace at { @@ -19,6 +20,7 @@ ThreadLocalState::ThreadLocalState() torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()), python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()), saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()), + dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()), saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) { #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER) for(size_t i=0; i>& tensor_sizes) { for (auto& sizes : tensor_sizes) { if (sizes.size() == 1) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) { + if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) { continue; } } @@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim( const MaterializedITensorListRef& tensors) { for (const Tensor& tensor : tensors) { if (tensor.dim() == 1) { - if 
(TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) { + if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) { continue; } } diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index afd0a6b67674a..4b8b5f6c5d187 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -216,6 +216,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(_convolution, lower_precision_fp) KERNEL_MPS(conv1d, lower_precision_fp) KERNEL_MPS(conv2d, lower_precision_fp) + KERNEL_MPS(conv3d, lower_precision_fp) KERNEL_MPS(conv_tbc, lower_precision_fp) KERNEL_MPS(conv_transpose1d, lower_precision_fp) KERNEL_MPS(conv_transpose2d, input, lower_precision_fp) @@ -239,6 +240,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) { KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp) // fp32 + KERNEL_MPS(conv_transpose3d, input, fp32) KERNEL_MPS(acos, fp32) KERNEL_MPS(asin, fp32) KERNEL_MPS(cosh, fp32) diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 5049018d731e1..53e95cd2d4cfd 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -251,6 +252,7 @@ struct CachingHostAllocatorImpl { auto* block = reinterpret_cast(ctx); std::optional> events; + ska::flat_hash_set streams; { std::lock_guard g(block->mutex_); block->allocated_ = false; @@ -259,14 +261,19 @@ struct CachingHostAllocatorImpl { } else { events = std::vector(); events->reserve(block->streams_.size()); - for (auto stream : block->streams_) { - record_stream(events, stream); - } - block->event_count_ += events->size(); + block->event_count_ += block->streams_.size(); + // Move out streams to avoid holding the mutex during event recording + streams = std::move(block->streams_); block->streams_.clear(); } } + // Event recording must be done outside the mutex to avoid potential + // deadlocks (e.g., when Python GIL is involved) + for (auto stream : streams) { + record_stream(events, stream); + } + if (!events) { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); @@ -345,7 +352,8 @@ struct CachingHostAllocatorImpl { } virtual bool pinned_use_background_threads() { - return false; + return c10::CachingAllocator::AcceleratorAllocatorConfig:: + pinned_use_background_threads(); } virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const { diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 8463379149e27..5f43738ea0faf 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -1,5 +1,18 @@ #pragma once +// See https://github.com/pytorch/pytorch/issues/161660 +// This compile flag is intended to be passed in to CppExtensions that rely on +// the stable ABI via the `extra_compile_args` argument. This is a stopgap +// solution to ensure that non-stable libtorch APIs are not used in the extension. +// The long term solution is to have a torch_stable target that excludes headers +// that are not in torch/stable or torch/headeronly. +// See test/cpp_extensions/torch_stable_test_extension/setup.py for an example +// of how this is used. 
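[Editorial aside, not part of the patch: a sketch of what the guard that follows does to an extension translation unit. The file name and the `-DTORCH_STABLE_ONLY` define passed through `extra_compile_args` are assumptions for illustration; the real usage lives in test/cpp_extensions/torch_stable_test_extension as the comment above notes.]

```cpp
// my_extension.cpp -- hypothetical extension TU, compiled with
// -DTORCH_STABLE_ONLY (e.g. via extra_compile_args in a CppExtension
// setup.py).
//
// TensorBase.h is not part of the stable surface, so including any header
// that transitively pulls it in now fails at compile time with the #error
// added in the guard below.
#include <ATen/core/TensorBase.h>  // compile error under TORCH_STABLE_ONLY

// Headers under torch/stable or torch/headeronly are the intended
// includes for TORCH_STABLE_ONLY builds.
```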
+#ifdef TORCH_STABLE_ONLY +#error \ + "TensorBase.h should not be included when TORCH_STABLE_ONLY compile flag is passed" +#endif + #include #include #include diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 06bcc5d4f49b8..4300217235b84 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include namespace c10 { @@ -17,6 +19,9 @@ class OperatorHandle; struct OperatorKernel; class KernelFunction; +class KernelToken; +class SafeKernelFunction; + template using has_symint = std::disjunction< std::is_same, @@ -90,6 +95,12 @@ class TORCH_API KernelFunction final { BoxedKernel::BoxedKernelFunction_withDispatchKeys; KernelFunction(); + ~KernelFunction(); + + KernelFunction(const KernelFunction& other); + KernelFunction& operator=(const KernelFunction& other); + + KernelFunction(KernelFunction&&) noexcept = default; // Fast path for dispatch to allow not touching the boxed kernel in // the common case where unboxed is available. @@ -262,6 +273,9 @@ class TORCH_API KernelFunction final { // For testing internal invariants only bool _equalsBoxedAndUnboxed(const KernelFunction&) const; + // Register a token to be invalidated when this KernelFunction is destroyed + void registerToken(std::weak_ptr token) const; + private: explicit KernelFunction( std::unique_ptr functor, @@ -276,6 +290,50 @@ class TORCH_API KernelFunction final { BoxedKernel boxed_kernel_func_; void* unboxed_kernel_func_; void* sym_unboxed_kernel_func_; + // List of tokens that need to be invalidated when this KernelFunction is + // destroyed (lazy allocation to save memory when empty) + mutable std::unique_ptr>> tokens_; +}; + +// Token held by SafeKernelFunction that gets invalidated when KernelFunction is +// destroyed +class KernelToken { + public: + bool isValid() const; + void invalidate(); + + private: + std::atomic invalid_{false}; +}; + +class SafeKernelFunction { + public: + SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle); + + // Safe callBoxed - checks token validity first + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + // Get debug information + const std::string& debug() const { + return debug_; + } + + // Get the OpHandle that lives on this SafeKernelFunction + const OperatorHandle& opHandle() const { + return *opHandle_; + } + + private: + KernelFunction kernel_; + std::shared_ptr token_; + std::string debug_; + std::shared_ptr opHandle_; }; } // namespace c10 diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index df49d6227ee93..672309ec19a2c 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -15,7 +15,7 @@ std::enable_if_t< std::is_base_of_v, std::unique_ptr> make_unique_base(Args&&... 
args) { - return std::unique_ptr(new Child(std::forward(args)...)); + return std::make_unique(std::forward(args)...); } } // namespace detail @@ -24,6 +24,36 @@ inline KernelFunction::KernelFunction() unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} +inline KernelFunction::~KernelFunction() { + if (tokens_) { + for (auto& weak_token : *tokens_) { + if (auto token = weak_token.lock()) { + token->invalidate(); + } + } + } +} + +inline KernelFunction::KernelFunction(const KernelFunction& other) + : boxed_kernel_func_(other.boxed_kernel_func_), + unboxed_kernel_func_(other.unboxed_kernel_func_), + sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) { + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed +} + +inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) { + if (this != &other) { + boxed_kernel_func_ = other.boxed_kernel_func_; + unboxed_kernel_func_ = other.unboxed_kernel_func_; + sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_; + + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed + } + return *this; +} + inline KernelFunction::KernelFunction( std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, @@ -157,6 +187,14 @@ C10_ALWAYS_INLINE Return KernelFunction::call( std::forward(args)...); } +inline void KernelFunction::registerToken( + std::weak_ptr token) const { + if (!tokens_) { + tokens_ = std::make_unique>>(); + } + tokens_->push_back(std::move(token)); +} + inline KernelFunction KernelFunction::makeFromBoxedKernel( BoxedKernel boxed_fn) { return KernelFunction( @@ -317,4 +355,38 @@ KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { std::forward(lambda))); } +inline bool KernelToken::isValid() const { + return !invalid_.load(std::memory_order_acquire); +} + +inline void KernelToken::invalidate() { + invalid_.store(true, std::memory_order_release); +} + +inline SafeKernelFunction::SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle) + : kernel_(kernel ? 
*kernel : KernelFunction()), + token_(std::make_shared()), + debug_(std::move(debug)), + opHandle_(std::move(opHandle)) { + // Register the token with the original kernel so it gets invalidated when the + // kernel is destroyed + if (kernel) { + kernel->registerToken(token_); + } +} + +inline void SafeKernelFunction::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + TORCH_CHECK( + token_ && token_->isValid(), + "SafeKernelFunction has been invalidated ", + debug_); + kernel_.callBoxed(opHandle, dispatchKeySet, stack); +} + } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index bc043df6a93e9..43eb0028c70fe 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -487,6 +487,10 @@ class TORCH_API OperatorHandle { return operatorDef_->op.hasComputedKernelForDispatchKey(k); } + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.getComputedKernelForDispatchKey(k); + } + std::string dumpComputedTable() const { return operatorDef_->op.dumpComputedTable(); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index b4063fb720be0..c172e9b9c6096 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -315,6 +315,42 @@ const AnnotatedKernel* OperatorEntry::getKernelForDispatchKey(DispatchKey dispat return nullptr; } +SafeKernelFunction OperatorEntry::getComputedKernelForDispatchKey( + DispatchKey k) const { + TORCH_CHECK( + !isAliasDispatchKey(k), + "Alias keys do not have runtime kernel registrations."); + const auto dispatch_ix = getDispatchTableIndexForDispatchKey(k); + TORCH_CHECK( + dispatchTable_[dispatch_ix].isValid(), + "no kernel for ", + k, + " for ", + name_); + + // Get the KernelFunction object from kernels_ to pass to SafeKernelFunction + + // The KernelFunction object in dispatchTable_ is a copy of the KernelFunction + // in the AnnotatedKernel in kernels_. A KernelFunction is only truly + // deregistered when the kernel is removed from kernels_. However, the + // KernelFunction in dispatchTable_ might be removed before it is deregistered + // (when a newer kernel is registered). Therefore, here we want to return a + // SafeKernelFunction that is backed by the original KernelFunction in + // kernels_, so that we only invalidate it when the kernel is deregistered. + auto [annotatedKernel, _] = + computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); + + // Use findSchemaOrThrow to get OpHandle for the OperatorEntry + auto& dispatcher = c10::Dispatcher::singleton(); + auto opHandle = dispatcher.findSchemaOrThrow( + name_.name.c_str(), name_.overload_name.c_str()); + + return SafeKernelFunction( + &annotatedKernel.kernel, + annotatedKernel.debug, + std::make_shared(opHandle)); +} + const std::vector& OperatorEntry::getTags() const { #if defined C10_MOBILE TORCH_CHECK(false, "tags are not saved for Mobile"); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 83200ff9c94ff..59b54ce1d9d32 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -217,6 +217,8 @@ class TORCH_API OperatorEntry final { const KernelFunction& kernelForDispatchKey(DispatchKey k) const; // Returns true if the "computed table" has an entry for a particular key. 
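[Editorial aside, not part of the patch: a hedged sketch of how the new `SafeKernelFunction` might be obtained and invoked. It uses only entry points introduced in this patch (`OperatorHandle::getComputedKernelForDispatchKey`, `SafeKernelFunction::callBoxed`) plus existing dispatcher utilities; the operator choice (`aten::add.Tensor` on CPU) and the raw CPU-only dispatch key set are assumptions for illustration.]

```cpp
// Illustrative sketch: fetch a kernel handle that is invalidated only when
// the underlying kernel is deregistered, then call it boxed.
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <vector>

void call_add_via_safe_kernel() {
  auto& dispatcher = c10::Dispatcher::singleton();
  c10::OperatorHandle op = dispatcher.findSchemaOrThrow("aten::add", "Tensor");

  // New in this patch: backed by the original KernelFunction in kernels_,
  // so the token stays valid until that kernel is actually deregistered.
  c10::SafeKernelFunction safe_kernel =
      op.getComputedKernelForDispatchKey(c10::DispatchKey::CPU);

  // Boxed calling convention: push all arguments (including defaults).
  std::vector<c10::IValue> stack;
  stack.emplace_back(at::ones({2, 2}));
  stack.emplace_back(at::ones({2, 2}));
  stack.emplace_back(at::Scalar(1));  // alpha

  // callBoxed checks the token first and throws, with the stored debug
  // string, if the kernel has been deregistered in the meantime.
  safe_kernel.callBoxed(op, c10::DispatchKeySet(c10::DispatchKey::CPU), &stack);

  at::Tensor result = stack.back().toTensor();
  (void)result;
}
```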
bool hasComputedKernelForDispatchKey(DispatchKey k) const; + // Returns a KernelFunction corresponding to the kernel in dispatchTable + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const; // Returns all the operator tags added at the time of registration const std::vector& getTags() const; void setReportErrorCallback_(std::unique_ptr callback); diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index b33e7ce0c5495..2ba841e44e202 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -64,6 +64,7 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(ScalarType, kDynamicIntTypeBit, 1) \ _(Layout, kDynamicIntTypeBit, 1) \ _(SymInt, kDynamicIntTypeBit, 1) \ + _(SymBool, kDynamicIntTypeBit, 1) \ _(MemoryFormat, kDynamicIntTypeBit, 1) #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index c6087f0a68ecf..72589436606ec 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -97,6 +97,8 @@ c10::TypePtr IValue::TagType::get(const IValue& v) { return ComplexType::get(); case Tag::Int: return IntType::get(); + case Tag::UInt: + return IntType::get(); case Tag::SymInt: return c10::SymIntType::get(); case Tag::SymFloat: @@ -320,6 +322,8 @@ IValue IValue::equals(const IValue& rhs) const { return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble(); case Tag::Int: return rhs.isInt() && lhs.toInt() == rhs.toInt(); + case Tag::UInt: + return rhs.isUnsigned() && lhs.toUInt() == rhs.toUInt(); case Tag::SymInt: return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt(); case Tag::SymFloat: @@ -379,6 +383,8 @@ size_t IValue::hash(const IValue& v) { case Tag::Int: return c10::get_hash(v.payload.u.as_int); // NB: these are technically strict aliasing violations + case Tag::UInt: + return c10::get_hash(v.payload.u.as_int); case Tag::SymInt: return c10::get_hash(v.payload.u.as_int); case Tag::SymFloat: @@ -806,6 +812,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printComplex(out, v); } case IValue::Tag::Int: return out << v.toInt(); + case IValue::Tag::UInt: + return out << v.toUInt(); case IValue::Tag::SymInt: return out << v.toSymInt(); case IValue::Tag::SymFloat: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 175860dc99a7c..ab2039e058201 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -160,6 +161,7 @@ struct Capsule { _(Double) \ _(ComplexDouble) \ _(Int) \ + _(UInt) \ _(SymInt) \ _(SymFloat) \ _(SymBool) \ @@ -653,6 +655,29 @@ struct TORCH_API IValue final { } } + // Unsigned + IValue(uint64_t u) : tag( u <= std::numeric_limits::max() ? 
Tag::Int : Tag::UInt) { + payload.u.as_uint = u; + } + + + // See Note [Meaning of HAS_u] + // IValue type model closely follows that of c10::Scalar + // Where all integers are upcast to 64-bit representation, and `as_int` is used as default + // representation unless value could not be represented as signed int + bool isUnsigned() const { + return Tag::UInt == tag || (Tag::Int == tag && payload.u.as_int >= 0); + } + + uint64_t toUInt() const { + if (isUnsigned()) { + return payload.u.as_uint; + } else { + TORCH_INTERNAL_ASSERT(0, "expected unsigned int"); + } + } + + // Bool IValue(bool b) : tag(Tag::Bool) { #if defined(__clang__) && defined(__x86_64__) @@ -893,8 +918,14 @@ struct TORCH_API IValue final { } else { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( s.isIntegral(false), "Unknown type in Scalar"); - tag = Tag::Int; - payload.u.as_int = s.toLong(); + if (s.isUnsigned()) { + const auto val = s.toUInt64(); + payload.u.as_uint = val; + tag = val <= std::numeric_limits::max() ? Tag::Int : Tag::UInt; + } else { + payload.u.as_int = s.toLong(); + tag = Tag::Int; + } } } @@ -918,6 +949,8 @@ struct TORCH_API IValue final { return toSymFloat(); else if (isSymBool()) return toSymBool(); + else if (isUnsigned()) + return toUInt(); TORCH_CHECK(false, "IValue is not a Scalar"); } @@ -1247,6 +1280,8 @@ struct TORCH_API IValue final { return true; case Tag::Int: return false; + case Tag::UInt: + return false; case Tag::SymInt: return true; case Tag::SymFloat: @@ -1343,6 +1378,8 @@ struct TORCH_API IValue final { union TriviallyCopyablePayload { TriviallyCopyablePayload() : as_int(0) {} int64_t as_int; + // See Note [Meaning of HAS_u] + uint64_t as_uint; double as_double; bool as_bool; // Invariant: never nullptr; null state is represented as diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index cf403365b2df2..0d319ea593840 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -832,7 +832,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } @@ -996,9 +996,6 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support tuning for Half inputs and FP32 output bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); } @@ -1006,9 +1003,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1273,7 +1268,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); } @@ -1289,7 +1284,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); @@ -1341,7 +1336,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1357,7 +1352,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } -#if defined(USE_ROCM) && !defined(_MSC_VER) +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); } @@ -1513,9 +1508,6 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support Tuning for fp16-fp32 gemm gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1523,9 +1515,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1847,8 +1837,12 @@ int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fa switch (scaling_type) { case ScalingType::BlockWise1x32: TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu); -#if CUDA_VERSION >= 12080 +#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000) +#ifdef USE_ROCM + return HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0; +#else return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0; +#endif // USE_ROCM #else TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above"); #endif // if CUDA_VERSION >= 12080 @@ -1943,15 +1937,33 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER; cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER; +#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) // hipblaslt supported row-wise before cublas, and did so their own way (via // the SCALE_POINTERSs), but then migrated to match how cublas does it (via // the SCALE_MODEs). Here we check for this early custom mode. 
-#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) - if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) { + bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); + if (use_rowwise) { matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; } -#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) + else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) { + #if ROCM_VERSION >= 70000 + if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { + // TODO: add constraints based on hipblaslt internals + TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), + "Matrix dimensions must be multiples of 32 for MX format. " + "Got m=", m, ", n=", n, ", k=", k); + } + #endif + } +#elif (CUDA_VERSION < 12090) && !defined(USE_ROCM) + // hipblaslt supported row-wise before cublas, and did so their own way (via + // the SCALE_POINTERSs), but then migrated to match how cublas does it (via + // the SCALE_MODEs). Here we check for this early custom mode. + bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); + // rowwise isn't supported using older cublaslt or older hipblaslt + TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); +#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) computeDesc.setAttribute(matmulDescA, mat1_scale_ptr); computeDesc.setAttribute(matmulDescB, mat2_scale_ptr); if (result_scale_ptr != nullptr) { @@ -1990,15 +2002,16 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype)); } - - // The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt, - // but we must invoke get_scale_mode anyways to trigger the version checks. - [[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum); - [[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum); -#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC)) - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode); - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode); -#endif + // For other data types, use the get_scale_mode function based on scaling type + // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt, + // but we must invoke get_scale_mode anyways to trigger the version checks. + // Note that AMD/ROCm follows OCP Spec 1.0, which is different from NVIDIA's implementation. See get_scale_mode() for details. 
+ [[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum); + [[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum); +#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC)) + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode); +#endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC)) CuBlasLtMatmulPreference preference; auto ltworkspace = CublasLtWorkspace(); @@ -2564,8 +2577,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } -// HIP on Windows does not support -#if !(defined(USE_ROCM) && defined(_MSC_VER)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2764,6 +2775,5 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 5021917fe0950..b235840418e25 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -343,9 +343,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int m, int n, int nrhs, Dtype** dA_array, int ldda, \ Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize -// HIP on Windows does not support getrs, geqrf, getrf, gels -#if !(defined(USE_ROCM) && defined(_MSC_VER)) - template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -400,28 +397,4 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); -#else // !(defined(USE_ROCM) && defined(_MSC_VER)) - -template -void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); -} - -template -void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); -} - -template -void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); -} - -template -void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); -} - -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) - } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 7fba7c4c7424c..b8cd84c56daef 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include @@ -253,6 +252,13 @@ cudaGraph_t CUDAGraph::raw_cuda_graph() { return graph_; } +cudaGraphExec_t CUDAGraph::raw_cuda_graph_exec() { + TORCH_CHECK( + has_graph_exec_, + "You cannot access the raw cudaGraphExec_t instance until instantiate() has been called"); + return graph_exec_; +} + void CUDAGraph::reset() { // I'd prefer these checks throw exceptions, not print warnings, // but the destructor calls reset(), and at least one CI build diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index 
c8cae16b624fe..c18ad66b20809 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { void enable_debug_mode(); void debug_dump(const std::string& debug_path); cudaGraph_t raw_cuda_graph(); + cudaGraphExec_t raw_cuda_graph_exec(); protected: cudaGraph_t graph_ = nullptr; diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 39fd0e16fac51..34aa15d0c06cf 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl } bool pinned_use_background_threads() override { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: + return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_background_threads(); } diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.cpp b/aten/src/ATen/cuda/PeerToPeerAccess.cpp index 91b487cd9c83e..66a75db6ea067 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.cpp +++ b/aten/src/ATen/cuda/PeerToPeerAccess.cpp @@ -4,6 +4,9 @@ #include #include +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#include +#endif #include #include @@ -12,6 +15,7 @@ namespace at::cuda { static std::vector p2pAccessEnabled_; +static std::vector fabricAccessEnabled_; static int64_t num_devices_ = -1; namespace detail { @@ -29,20 +33,23 @@ void init_p2p_access_cache(int64_t num_devices) { for (const auto i : c10::irange(num_devices)) { p2pAccessEnabled_[i * num_devices + i] = 1; } + fabricAccessEnabled_.clear(); + fabricAccessEnabled_.resize(num_devices, -1); } -} // namespace detail +} // namespace detail bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { at::globalContext().lazyInitDevice(c10::DeviceType::CUDA); - TORCH_CHECK(dev >= 0 || dev < num_devices_, - dev, " is not a device"); - TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_, - dev_to_access, " is not a device"); + TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device"); + TORCH_CHECK( + dev_to_access >= 0 || dev_to_access < num_devices_, + dev_to_access, + " is not a device"); TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized"); - auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; + auto& cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; if (cache != -1) { return cache; @@ -58,4 +65,118 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) { return cache; } -} // namespace at::cuda::detail +namespace { +#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED + +nvmlDevice_t get_nvml_device(c10::DeviceIndex dev) { + static bool nvml_init [[maybe_unused]] = []() { + TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_()); + return true; + }(); + + auto prop = at::cuda::getDeviceProperties(dev); + char pci_id // NOLINT(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) + [NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + snprintf( + pci_id, + sizeof(pci_id), + NVML_DEVICE_PCI_BUS_ID_FMT, + prop->pciDomainID, + prop->pciBusID, + prop->pciDeviceID); + + nvmlDevice_t nvml_device = nullptr; + TORCH_INTERNAL_ASSERT( + NVML_SUCCESS == + DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_( + pci_id, &nvml_device)); + return nvml_device; +} + +bool isFabricSupported() { + // 1. 
try allocating memory + CUmemGenericAllocationHandle handle = 0; + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + + size_t granularity{}; + const auto driver_api = c10::cuda::DriverAPI::get(); + C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + + auto status = driver_api->cuMemCreate_(&handle, granularity, &prop, 0); + if (status != CUDA_SUCCESS) { + LOG(INFO) + << "status " << status + << " Could not allocate memory with FABRIC handle, falling back to fd handle exchange\n"; + return false; + } + // 2. check export + CUmemFabricHandle sharedHandle; + status = driver_api->cuMemExportToShareableHandle_( + &sharedHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0); + if (status != CUDA_SUCCESS) { + LOG(INFO) + << "status " << status + << " Could not export FABRIC handle, falling back to fd handle exchange\n"; + driver_api->cuMemRelease_(handle); + return false; + } + // 3. check import + CUmemGenericAllocationHandle import_handle = 0; + status = driver_api->cuMemImportFromShareableHandle_( + &import_handle, &sharedHandle, CU_MEM_HANDLE_TYPE_FABRIC); + if (status != CUDA_SUCCESS) { + LOG(INFO) + << "status " << status + << " Could not import FABRIC handle, falling back to fd handle exchange\n"; + driver_api->cuMemRelease_(handle); + return false; + } + driver_api->cuMemRelease_(import_handle); + driver_api->cuMemRelease_(handle); + LOG(INFO) << "using fabric to exchange memory handles\n"; + return true; +} +#endif +} // namespace + +bool get_fabric_access(c10::DeviceIndex dev) { +#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED + at::globalContext().lazyInitDevice(c10::DeviceType::CUDA); + + TORCH_CHECK(dev >= 0 || dev < num_devices_, dev, " is not a device"); + auto& cache = fabricAccessEnabled_[dev]; + if (cache != -1) { + return cache; + } + auto nvml_device = get_nvml_device(dev); + if (nvml_device != nullptr) { + nvmlGpuFabricInfoV_t fabricInfo; + fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; + fabricInfo.version = nvmlGpuFabricInfo_v2; + if (DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_ == nullptr) { + return false; + } + TORCH_CHECK( + NVML_SUCCESS == + DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_( + nvml_device, &fabricInfo)); + auto state = fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; + if (state) { + // now perform the full cycle of allocating - exporting - importing memory + state = isFabricSupported(); + } + cache = state ? 
1 : 0; + return cache; + } else { + return false; + } +#else + return false; +#endif +} + +} // namespace at::cuda diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.h b/aten/src/ATen/cuda/PeerToPeerAccess.h index 5b63a855f3f46..30d21af83ed88 100644 --- a/aten/src/ATen/cuda/PeerToPeerAccess.h +++ b/aten/src/ATen/cuda/PeerToPeerAccess.h @@ -8,5 +8,6 @@ void init_p2p_access_cache(int64_t num_devices); } TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev); +TORCH_CUDA_CPP_API bool get_fabric_access(c10::DeviceIndex device); } // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index aad19c6771ed7..b809512692093 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -54,7 +54,7 @@ // There were many bc-breaking changes in major version release of CCCL v3.0.0 // Please see https://nvidia.github.io/cccl/cccl/3.0_migration_guide.html -#if CUB_VERSION >= 300000 +#if CUB_VERSION >= 200800 #define CUB_V3_PLUS() true #else #define CUB_V3_PLUS() false diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 247fdb2537cb4..72826b5847925 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -19,10 +19,6 @@ #include #include -#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) -#include -#endif - #if AT_CUDNN_ENABLED() #include #endif @@ -93,29 +89,6 @@ void CUDAHooks::init() const { // have a chance to enable vitals. at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true); - // Sets the CUDA_MODULE_LOADING environment variable - // if it's not set by the user. - // CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+. - // Check the driver version and only set the env variable if needed. - bool set_lazy_module_loading = true; - #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) - auto driver_api = c10::cuda::DriverAPI::get(); - // Initialize NVML - if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) { - // Get the driver version - int version = -1; - auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version); - if (res == NVML_SUCCESS) { - // Check if driver is sufficiently new - if (version >= 12020) { - set_lazy_module_loading = false; - } - } - } - #endif - if (set_lazy_module_loading) { - c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); - } const auto num_devices = c10::cuda::device_count_ensure_non_zero(); c10::cuda::CUDACachingAllocator::init(num_devices); at::cuda::detail::init_p2p_access_cache(num_devices); @@ -207,6 +180,27 @@ bool CUDAHooks::hasCuBLASLt() const { #endif } + +bool CUDAHooks::hasCKSDPA() const { +#if !defined(USE_ROCM) + return false; +#elif defined(USE_ROCM) && defined(USE_ROCM_CK_SDPA) + return true; +#else + return false; +#endif +} + +bool CUDAHooks::hasCKGEMM() const { +#if !defined(USE_ROCM) + return false; +#elif defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasROCM() const { // Currently, this is same as `compiledWithMIOpen`. 
// But in future if there are ROCm builds without MIOpen, diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index b0dac7a71e809..2780369a37b71 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -31,6 +31,8 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCuSOLVER() const override; bool hasCuBLASLt() const override; bool hasROCM() const override; + bool hasCKSDPA() const override; + bool hasCKGEMM() const override; const at::cuda::NVRTC& nvrtc() const override; DeviceIndex current_device() const override; bool isBuilt() const override {return true;} diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index a65db3f2df12a..487e798bd80f6 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -49,12 +49,12 @@ struct OffsetCalculator { #if defined(USE_ROCM) if ((dims > 0) && (dims <= 2)) { auto divmod = sizes_[0].divmod(linear_idx); - #pragma unroll +#pragma unroll for (int arg = 0; arg < NARGS; arg++) offsets[arg] = divmod.mod * strides_[0][arg]; if (dims >= 2) { divmod = sizes_[1].divmod(divmod.div); - #pragma unroll +#pragma unroll for (int arg = 0; arg < NARGS; arg++) offsets[arg] += divmod.mod * strides_[1][arg]; } diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index d89875865b887..aca83386ad421 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -117,6 +117,8 @@ namespace at::cuda { _(nvrtcGetPTXSize) \ _(nvrtcGetPTX) \ _(cuModuleLoadData) \ + _(cuModuleLoad) \ + _(cuGetErrorString) \ _(cuModuleGetFunction) \ _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \ _(nvrtcGetErrorString) \ diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index 670137e48cbc3..1f71a61c0fba1 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -91,7 +91,6 @@ constexpr hipDataType HipDataTypeFor() { #if ROCM_VERSION >= 70000 return HIP_R_4F_E2M1; #else - // Return HIP_R_4F_E2M1 enum value for earlier ROCm version. return static_cast(33); #endif } diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9972cbd1c1514..3511e48ae061a 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -220,19 +220,17 @@ TuningResultsValidator::TuningResultsValidator() { []() { return GetPyTorchVersion(); }, [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); #ifdef USE_ROCM - // rocm + // hip { -#ifdef _WIN32 - std::string rocm_version = HIP_VERSION_BUILD_NAME; -#else - std::string rocm_version = ROCM_BUILD_INFO; -#endif + // HIP version is more accurate than ROCm version. User's environment could be a stock + // ROCm install but with a mix of newer components, making ROCm version meaningless. + std::string hip_version = c10::str(TORCH_HIP_VERSION); RegisterValidator( - "ROCM_VERSION", - [rocm_version]() { return rocm_version; }, - [rocm_version](auto&& k) { - TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version); - return rocm_version == k ? OK : FAIL; + "HIP_VERSION", + [hip_version]() { return hip_version; }, + [hip_version](auto&& k) { + TUNABLE_LOG1("HIP_VERSION validation: expect ", k, " to match ", hip_version); + return hip_version == k ? 
OK : FAIL; }); } // gfx arch diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 6c2492b12e6b9..85f0286542e75 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -38,6 +38,7 @@ inline int dataSize(cudnnDataType_t dataType) } } +// NOTE [ cudnn fixSizeOneDimStride ] // The stride for a size-1 dimensions is not uniquely determined; in // fact, it can be anything you want, because the fact that the // tensor is size 1 at this dimension means that you will never actually diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f99e03d156c9b..00573e3cf701b 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -118,6 +118,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } + virtual bool hasCKSDPA() const { + return false; + } + + virtual bool hasCKGEMM() const { + return false; + } + virtual const at::cuda::NVRTC& nvrtc() const { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } diff --git a/aten/src/ATen/detail/MTIAHooksInterface.cpp b/aten/src/ATen/detail/MTIAHooksInterface.cpp index b6e260e59ec41..d2e331abb0c04 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.cpp +++ b/aten/src/ATen/detail/MTIAHooksInterface.cpp @@ -21,6 +21,10 @@ bool isMTIAHooksBuilt() { } // namespace detail +bool MTIAHooksInterface::isAvailable() const { + return detail::isMTIAHooksBuilt() && detail::getMTIAHooks().deviceCount() > 0; +} + C10_DEFINE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs) } // namespace at diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index fb8ed6fb23226..b415862f29e7c 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -149,6 +149,8 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { FAIL_MTIAHOOKS_FUNC(__func__); return; } + + virtual bool isAvailable() const override; }; struct TORCH_API MTIAHooksArgs {}; diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index de69e5c1e23a4..6e63708a90f4a 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -44,8 +45,13 @@ static std::tuple> embedding_batch_rule( const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight); auto indices_ = moveBatchDimToFront(indices, indices_bdim); - const auto range = getStepTensor(indices, batch_size, num_embeddings); - indices_ = indices_ + range; + { + // getStepTensor returns a regular Tensor. If indices_ is a DTensor + // we want to allow this mixed DTensor-Tensor operation. 
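Sketch (not part of this patch): backend-selection code can consult the new capability hooks declared above. The helper names are hypothetical; both hooks return false on non-ROCm builds and on ROCm builds compiled without USE_ROCM_CK_GEMM / USE_ROCM_CK_SDPA.

#include <ATen/detail/CUDAHooksInterface.h>

inline bool ck_gemm_available() {
  return at::detail::getCUDAHooks().hasCKGEMM();
}

inline bool ck_sdpa_available() {
  return at::detail::getCUDAHooks().hasCKSDPA();
}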
+ at::DTensorAllowImplicitReplication guard; + const auto range = getStepTensor(indices, batch_size, num_embeddings); + indices_ = indices_ + range; + } auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse); return std::make_tuple(std::move(result), 0); } diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index 39ab441478e8f..f4316def4fb42 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -1,7 +1,6 @@ #pragma once -#include -#include +#include // Use of c10::hip namespace here makes hipification easier, because // I don't have to also fix namespaces. Sorry! @@ -10,22 +9,227 @@ namespace c10::hip { // Takes a valid HIPAllocator (of any sort) and turns it into // an allocator pretending to be a CUDA allocator. See // Note [Masquerading as CUDA] -class HIPAllocatorMasqueradingAsCUDA final : public Allocator { - Allocator* allocator_; +class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator { + HIPCachingAllocator::HIPAllocator* allocator_; public: - explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator) + explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator) : allocator_(allocator) {} + + virtual ~HIPAllocatorMasqueradingAsCUDA() = default; + + // From c10::Allocator + DataPtr allocate(size_t size) override { DataPtr r = allocator_->allocate(size); r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index())); return r; } + + bool is_simple_data_ptr(const DataPtr& data_ptr) const override { + return allocator_->is_simple_data_ptr(data_ptr); + } + DeleterFnPtr raw_deleter() const override { return allocator_->raw_deleter(); } + void copy_data(void* dest, const void* src, std::size_t count) const final { allocator_->copy_data(dest, src, count); } + + // From DeviceAllocator + + bool initialized() override { + return allocator_->initialized(); + } + + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); + } + + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + HIPStream hip_stream = HIPStream(stream); + recordStream(ptr, hip_stream); + } + + CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override { + return allocator_->getDeviceStats(device); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + allocator_->resetAccumulatedStats(device); + } + + void resetPeakStats(c10::DeviceIndex device) override { + allocator_->resetPeakStats(device); + } + + // From CUDAAllocator + + void* raw_alloc(size_t nbytes) override { + return allocator_->raw_alloc(nbytes); + } + + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override { + return allocator_->raw_alloc_with_stream(nbytes, stream); + } + + void raw_delete(void* ptr) override { + allocator_->raw_delete(ptr); + } + + void init(int device_count) override { + allocator_->init(device_count); + } + + double getMemoryFraction(c10::DeviceIndex device) override { + return allocator_->getMemoryFraction(device); + } + + void setMemoryFraction(double fraction, c10::DeviceIndex device) override { + allocator_->setMemoryFraction(fraction, device); + } + + void enable(bool value) override { + allocator_->enable(value); + } + + bool isEnabled() const override { + return allocator_->isEnabled(); + } + + void cacheInfo(c10::DeviceIndex device, size_t* 
largestBlock) override { + allocator_->cacheInfo(device, largestBlock); + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + return allocator_->getBaseAllocation(ptr, size); + } + + void recordStream(const DataPtr& ptr, HIPStream stream) override { + allocator_->recordStream(ptr, stream); + } + + HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { + return allocator_->snapshot(mempool_id); + } + + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) override { + allocator_->beginAllocateToPool(device, mempool_id, filter); + } + + void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) override { + allocator_->endAllocateToPool(device, mempool_id); + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->releasePool(device, mempool_id); + } + + int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override { + return allocator_->getPoolUseCount(device, mempool_id); + } + + void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPAllocator* allocator = nullptr) override { + allocator_->createOrIncrefPool(device, mempool_id, allocator); + } + + void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setUseOnOOM(device, mempool_id); + } + + bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) override { + return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations); + } + + HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override { + return allocator_->shareIpcHandle(ptr); + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + return allocator_->getIpcDevPtr(handle); + } + + bool isHistoryEnabled() override { + return allocator_->isHistoryEnabled(); + } + + void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) override { + allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); + } + + void recordAnnotation( + const std::vector>& md) override { + allocator_->recordAnnotation(md); + } + + void pushCompileContext(std::string& md) override { + allocator_->pushCompileContext(md); + } + + void popCompileContext() override { + allocator_->popCompileContext(); + } + + void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override { + allocator_->attachOutOfMemoryObserver(observer); + } + + void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override { + allocator_->attachAllocatorTraceTracker(tracker); + } + + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override { + allocator_->enablePeerAccess(dev, dev_to_access); + } + + hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override { + return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); + } + + std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) override { + return allocator_->getCheckpointState(device, id); + } + + HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) override { + 
auto cpd = allocator_->setCheckpointPoolState(device, pps); + for (auto& ptr : cpd.dataptrs_allocd) { + ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index())); + } + return cpd; + } + + std::string name() override { + return allocator_->name(); + } + }; } // namespace c10::hip diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp index 46f7d247293a1..53e7980b3d3f9 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp @@ -1,10 +1,11 @@ -#include +#include +#include #include namespace c10 { namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -Allocator* get() { +HIPCachingAllocator::HIPAllocator* get() { static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get()); return &allocator; } diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h index 3aaa9d06c5e91..1d3606b456fca 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -10,9 +10,185 @@ class DataPtr; namespace hip { namespace HIPCachingAllocatorMasqueradingAsCUDA { -C10_HIP_API Allocator* get(); +C10_HIP_API HIPCachingAllocator::HIPAllocator* get(); C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream); +inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + return get()->raw_delete(ptr); +} + +inline void init(int device_count) { + return get()->init(device_count); +} + +inline double getMemoryFraction(c10::DeviceIndex device) { + return get()->getMemoryFraction(device); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + return get()->setMemoryFraction(fraction, device); +} + +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + return get()->emptyCache(mempool_id); +} + +inline void enable(bool value) { + return get()->enable(value); +} + +inline bool isEnabled() { + return get()->isEnabled(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + return get()->cacheInfo(device, largestBlock); +} + +inline void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + return get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + return get()->resetPeakStats(device); +} + +inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return get()->setCheckpointPoolState(device, std::move(pps)); +} + +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + 
get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +} + +inline void recordAnnotation( + const std::vector>& md) { + return get()->recordAnnotation(md); +} + +inline void pushCompileContext(std::string& md) { + return get()->pushCompileContext(md); +} + +inline void popCompileContext() { + return get()->popCompileContext(); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) { + return get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) { + return get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->releasePool(device, mempool_id); +} + +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} + +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +} + +inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->getPoolUseCount(device, mempool_id); +} + +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) { + return get()->shareIpcHandle(ptr); +} + +inline std::string name() { + return get()->name(); +} + +inline hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + return get()->enablePeerAccess(dev, dev_to_access); +} + } // namespace HIPCachingAllocatorMasqueradingAsCUDA } // namespace hip } // namespace c10 diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 08c09b88f99cb..86e42ee3b66dc 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -19,31 +19,37 @@ inline miopenDataType_t getDataType(const at::Tensor& t) { } else { TORCH_CHECK( false, - "TensorDescriptor only supports float, half and bfloat16 tensors"); + "TensorDescriptor does not support ", scalar_type); } } } // anonymous namespace +constexpr size_t MIOPEN_DIM_MAX = 5; -void TensorDescriptor::set(const at::Tensor &t, size_t pad) { - set(getDataType(t), t.sizes(), t.strides(), pad); +void TensorDescriptor::set(const at::Tensor &t, at::MemoryFormat memory_format, size_t 
pad) { + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } -constexpr size_t MIOPEN_DIM_MAX = 5; +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + auto memory_format = t.suggest_memory_format(); + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); +} void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad) { + set(datatype, t_sizes, t_strides, pad, + is_channels_last_strides_2d(t_sizes, t_strides) || + is_channels_last_strides_3d(t_sizes, t_strides)); +} + +void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) { size_t dim = t_sizes.size(); if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; for (const auto i : c10::irange(dim)) { @@ -54,7 +60,7 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr size[i] = 1; stride[i] = 1; } - set(datatype, static_cast(std::max(dim, pad)), size, stride); + set(datatype, static_cast(std::max(dim, pad)), size, stride, nhwc); } std::string miopenTypeToString(miopenDataType_t dtype) { @@ -74,10 +80,11 @@ std::string miopenTypeToString(miopenDataType_t dtype) { std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; - int nbDims = 4; + int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; + miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); out << " type = " << miopenTypeToString(dtype) << "\n"; out << " nbDims = " << nbDims << "\n"; @@ -99,19 +106,17 @@ void TensorDescriptor::print() { std::cout << *this; } void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) { auto dim = t.ndimension(); - if (dim > static_cast(MIOPEN_DIM_MAX) || pad > static_cast(MIOPEN_DIM_MAX)) { -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR - } + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. TORCH_CHECK(t.is_contiguous(memory_format), - "MIOpen filters (a.k.a. weights) must be contiguous"); + "MIOpen filters (a.k.a. 
weights) must be contiguous in desired memory_format\n", + "Weight sizes: ", t.sizes(), "\n", + "Weight strides: ", t.strides(), "\n", + "cuDNN suggested memory_format: ", memory_format); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; @@ -131,7 +136,9 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo } dim = std::max(dim, pad); - set(getDataType(t), (int) dim, size, stride); + set(getDataType(t), static_cast(dim), size, stride, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } }} diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index 2eee837cd533d..8825575c9231b 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -9,6 +9,8 @@ namespace at { namespace native { +std::string miopenTypeToString(miopenDataType_t dtype); + inline int dataSize(miopenDataType_t dataType) { switch (dataType) { @@ -19,6 +21,32 @@ inline int dataSize(miopenDataType_t dataType) } } +// See NOTE [ cudnn fixSizeOneDimStride ] in aten/src/ATen/cudnn/Descriptors.h +template +static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) { + int64_t z = 1; + int index = 0; + std::vector permutation(dim); + + if (nhwc) { + permutation[index++] = 1; + } + for (int d = dim-1; d > 1; d--) { + permutation[index++] = d; + } + if (!nhwc) { + permutation[index++] = 1; + } + permutation[index++] = 0; + for (int d : permutation) { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + template struct DescriptorDeleter { void operator()(T* x) { @@ -75,14 +103,20 @@ class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< set(t, pad); } + // See Note [CuDNN broadcast padding] void set(const at::Tensor &t, size_t pad = 0); + void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0); void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0); void print(); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc); + + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -100,8 +134,10 @@ class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -166,4 +202,4 @@ union Constant } }; -}} // namespace +}} // namespace diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index 7b04d65ebdd02..d858df0733975 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -43,7 +43,6 @@ TensorBase empty_mps( int64_t nelements 
= c10::multiply_integers(size); auto dtype = dtype_or_default(dtype_opt); TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED); - TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer"); auto dtype_meta = scalarTypeToTypeMeta(dtype); diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index a70ce25108201..9b58477104978 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -18,11 +18,7 @@ namespace at::mps { // Helper enum to check if a MPSGraph op is supported in a given macOS version enum class MacOSVersion : uint32_t { - MACOS_VER_13_1_PLUS = 0, - MACOS_VER_13_2_PLUS, - MACOS_VER_13_3_PLUS, - MACOS_VER_14_0_PLUS, - MACOS_VER_14_4_PLUS, + MACOS_VER_14_4_PLUS = 0, MACOS_VER_15_0_PLUS, MACOS_VER_15_1_PLUS, MACOS_VER_15_2_PLUS, @@ -59,6 +55,17 @@ class TORCH_API MPSDevice { */ bool isMacOS13Plus(MacOSVersion version) const; + /** + * Returns device name + */ + std::string getName() const; + + /** + * Returns number of GPU cores. + * 1 Core = 16 ExecutionUnit x 8 ALU x 24 threads + */ + unsigned getCoreCount() const; + ~MPSDevice(); private: diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm index 55af5f83b388c..5a37490c02402 100644 --- a/aten/src/ATen/mps/MPSDevice.mm +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -32,11 +32,11 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de MPSDevice::MPSDevice() : _mtl_device(nil) { // Check that MacOS 13.0+ version of MPS framework is available - // Create the MPSGraph and check method introduced in 13.0 + // Create the MPSGraph and check method introduced in 14.0 // which is used by MPS backend. 
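Sketch (not part of this patch): reading the two MPSDevice accessors added above. getCoreCount() walks IOKit's "AGXAccelerator" registry entries, so the value is only meaningful on Apple silicon; the logging helper below is hypothetical.

#include <ATen/mps/MPSDevice.h>
#include <iostream>

void log_mps_device_info() {
  auto* device = at::mps::MPSDevice::getInstance();
  std::cout << "MPS device: " << device->getName()
            << " (" << device->getCoreCount() << " GPU cores)" << std::endl;
}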
id mpsCD = NSClassFromString(@"MPSGraph"); - if ([mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == NO) { + if ([mpsCD instancesRespondToSelector:@selector(HermiteanToRealFFTWithTensor:axes:descriptor:name:)] == NO) { return; } @@ -66,24 +66,12 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de isOperatingSystemAtLeastVersion:{.majorVersion = major, .minorVersion = minor, .patchVersion = 0}]; } }; - static bool _macos_13_1_plus = is_os_version_at_least(13, 1); - static bool _macos_13_2_plus = is_os_version_at_least(13, 2); - static bool _macos_13_3_plus = is_os_version_at_least(13, 3); - static bool _macos_14_0_plus = is_os_version_at_least(14, 0); static bool _macos_14_4_plus = is_os_version_at_least(14, 4); static bool _macos_15_0_plus = is_os_version_at_least(15, 0); static bool _macos_15_1_plus = is_os_version_at_least(15, 1); static bool _macos_15_2_plus = is_os_version_at_least(15, 2); switch (version) { - case MacOSVersion::MACOS_VER_13_1_PLUS: - return _macos_13_1_plus; - case MacOSVersion::MACOS_VER_13_2_PLUS: - return _macos_13_2_plus; - case MacOSVersion::MACOS_VER_13_3_PLUS: - return _macos_13_3_plus; - case MacOSVersion::MACOS_VER_14_0_PLUS: - return _macos_14_0_plus; case MacOSVersion::MACOS_VER_14_4_PLUS: return _macos_14_4_plus; case MacOSVersion::MACOS_VER_15_0_PLUS: @@ -97,10 +85,36 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& de } } +std::string MPSDevice::getName() const { + @autoreleasepool { + return [[_mtl_device name] UTF8String]; + } +} + +unsigned MPSDevice::getCoreCount() const { + io_iterator_t iterator = 0; + io_registry_entry_t entry = 0; + int core_count = 0; + auto matchingDict = IOServiceMatching("AGXAccelerator"); + TORCH_INTERNAL_ASSERT(matchingDict, "Failed to create matching dict"); + const auto status = IOServiceGetMatchingServices(kIOMainPortDefault, matchingDict, &iterator); + TORCH_INTERNAL_ASSERT(status == KERN_SUCCESS); + while ((entry = IOIteratorNext(iterator)) != 0) { + auto property = IORegistryEntryCreateCFProperty(entry, CFSTR("gpu-core-count"), kCFAllocatorDefault, 0); + auto found = CFNumberGetValue(static_cast(property), kCFNumberIntType, &core_count); + CFRelease(property); + IOObjectRelease(entry); + if (found) { + break; + } + } + IOObjectRelease(iterator); + return core_count; +} + at::Allocator* GetMPSAllocator(bool useSharedAllocator) { return getIMPSAllocator(useSharedAllocator); } - bool is_available() { return MPSDevice::getInstance()->device() != nil; } diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index f6133e8877222..a2ec221c1bfea 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -34,7 +34,7 @@ case 14: switch (minor) { case 0: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); + return true; case 4: return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); default: @@ -42,19 +42,7 @@ return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); } case 13: - switch (minor) { - case 0: - return true; - case 1: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS); - case 2: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS); - case 3: - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - default: - TORCH_WARN("Can't check whether running on 13.", minor, "+ returning one for 13.3+"); - return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - } + return true; default: TORCH_WARN("Checking for unexpected MacOS ", major, 
".", minor, " returning false"); return false; diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 674ccf11cfb9b..49366151ae60b 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #if !defined(__s390x__) && !defined(__powerpc__) #include #endif @@ -332,4 +333,23 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +// TODO(vasiliy, future PR): figure out why we need to declare this function, when +// other functions that live in ATen/native/*.cpp without declarations +// or headers work just fine. +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype); + +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); + return out; +} + } // namespace at::native diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index 79dbe7353e159..e06afddd05aa7 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -51,7 +51,7 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int * // brgemm_pack_B is changed to transform and the setting of brgemm beta is changed to set_add_C #if (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR == 5) #define ONEDNN_UKERNEL_1 -#elif (IDEEP_VERSION_MAJOR >= 3 && IDEEP_VERSION_MINOR >= 6) +#elif ((IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR >= 6) || (IDEEP_VERSION_MAJOR > 3)) #define ONEDNN_UKERNEL_2 #endif #if ((defined(ONEDNN_UKERNEL_1) || defined(ONEDNN_UKERNEL_2)) && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))) @@ -496,18 +496,18 @@ void gemm( // for the fallback path, first compute gemm with beta = 0, // and then add c in full precision. 
int64_t c_size = n * m; - std::vector float16_c(c_size, 0.f); - gemm_stub( + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( at::kCPU, at::kHalf, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { auto offset = j * ldc + i; // beta == 0 won't propagate NaN from C if (beta == 0.f) { - c[offset] = c10::convert(float16_c[j * m + i]); + c[offset] = float_c[j * m + i]; } else { - c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + c[offset] = beta * c[offset] + float_c[j * m + i]; } } } diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index 95d11903dc773..8b75f12ebaf21 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -206,6 +206,16 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex float +#define CPUBLAS_BRGEMM_BF16BF16F32 // bfloat16 * bfloat16 -> float +#define CPUBLAS_BRGEMM_F32F32F32 // float * float -> float +#define CPUBLAS_BRGEMM_U8U8I32 // unsigned char * unsigned char -> int32 +#define CPUBLAS_BRGEMM_U8I8I32 // unsigned char * signed char -> int32 +#define CPUBLAS_BRGEMM_I8I8I32 // signed char * signed char -> int32 + TORCH_API void brgemm( int64_t M, int64_t N, diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 84381efe55b0b..e160c84ced331 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -353,19 +353,21 @@ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable); TORCH_API bool _cudnn_get_conv_benchmark_empty_cache(); -inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { - +inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { // disable NHWC for float64 input. if (!at::detail::getCUDAHooks().compiledWithMIOpen() || input.scalar_type() == at::kDouble || weight.scalar_type() == at::kDouble) { - return false; + return at::MemoryFormat::Contiguous; } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen - // See #64427 - static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); - static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC; + // See https://github.com/pytorch/pytorch/issues/64427. + // non static variable is used to be able to change environment variable in runtime for testing + // enabled by default for ROCm >= 7.0.0 with miopen 3.5 + int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? 
detail::getCUDAHooks().versionMIOpen() : 0; + bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 + bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5); auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); @@ -375,13 +377,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten (input_memory_format == at::MemoryFormat::ChannelsLast) || (weight_memory_format == at::MemoryFormat::ChannelsLast) ); + if (can_use_miopen_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && ( (input_memory_format == at::MemoryFormat::ChannelsLast3d) || (weight_memory_format == at::MemoryFormat::ChannelsLast3d) ); + if (can_use_miopen_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} - return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; +// deprecated, but to remove would be BC-breaking +inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + return miopen_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous; } inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index b926df11c21f3..ab427f396e345 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include @@ -299,67 +301,50 @@ struct ConvParams { bool allow_tf32{}; bool is_strided() const { - bool is_strided = false; - for (const auto& s : stride) { - is_strided |= (s != 1); - } - return is_strided; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; }); } bool is_dilated() const { - bool is_dilated = false; - for (const auto& d : dilation) { - is_dilated |= (d != 1); - } - return is_dilated; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d != 1; }); } bool is_padded() const { - bool is_padded = false; - for (auto p : padding) { - is_padded |= (p != 0); - } - return is_padded; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p != 0; }); } bool is_output_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : output_padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + output_padding.cbegin(), + output_padding.cend(), + [](const T& p) { return p < 0; }); } bool is_output_padding_big() const { - bool is_big = false; + // Revisit this with std::views::zip at C++20. 
for (auto i: c10::irange(output_padding.size())) { - is_big |= (output_padding[i] >= stride[i]); + if (output_padding[i] >= stride[i]) { + return true; + } } - return is_big; + return false; } bool is_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p < 0; }); } bool is_dilation_neg() const { - bool is_non_neg = false; - for (const auto& p : dilation) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d < 0; }); } bool is_stride_nonpos() const { - bool is_nonpos = false; - for (const auto& s : stride) { - is_nonpos |= (s <= 0); - } - return is_nonpos; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; }); } void view1d_as_2d() { @@ -458,12 +443,15 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { + if (!detail::getCUDAHooks().compiledWithCuDNN()) { + return false; + } if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { // always use cudnn_depthwise for channels_last format return true; } // native kernel doesn't support 64-bit non-splittable case - if (cudnn_enabled && needs_64bit_indexing_no_split(input, weight)) { + if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" @@ -1418,10 +1406,8 @@ static inline at::MemoryFormat determine_backend_memory_format( case ConvBackend::Miopen: case ConvBackend::MiopenDepthwise: case ConvBackend::MiopenTranspose: - if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { - TORCH_INTERNAL_ASSERT((k == 4 || k == 5), - "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; + if (detail::getCUDAHooks().compiledWithMIOpen()) { + backend_memory_format = miopen_conv_suggest_memory_format(input, weight); } break; case ConvBackend::Mkldnn: diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 64c39fcaef239..cb437fb45ce21 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -260,6 +260,7 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2); \ \ std::vector result; \ + result.reserve(input.size()); \ for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalar)); \ } \ @@ -288,6 +289,7 @@ namespace at::native { check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \ \ std::vector result; \ + result.reserve(input.size()); \ for (const auto i : c10::irange(input.size())) { \ result.emplace_back(input[i].OP(tensors1[i], tensors2[i], scalars[i])); \ } \ @@ -417,6 +419,7 @@ std::vector foreach_tensor_ternary_lerp_slow( TensorList tensors3) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); std::vector result; + result.reserve(tensors1.size()); for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], tensors3[i])); } @@ -439,6 +442,7 @@ std::vector foreach_tensor_lerp_scalarlist_kernel_slow( at::ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, scalars); std::vector result; + result.reserve(tensors1.size()); for (const auto i : c10::irange(tensors1.size())) { result.emplace_back(tensors1[i].lerp(tensors2[i], scalars[i])); } @@ -469,6 +473,7 @@ std::vector foreach_tensor_norm_slow( std::optional dtype) { check_foreach_api_restrictions(tensors); std::vector result; + result.reserve(tensors.size()); for (const auto& t : tensors) { result.emplace_back(at::linalg_vector_norm(t, ord, {}, false, dtype)); } @@ -478,6 +483,7 @@ std::vector foreach_tensor_norm_slow( std::vector foreach_tensor_max_slow(TensorList tensors) { check_foreach_api_restrictions(tensors); std::vector result; + result.reserve(tensors.size()); for (const auto& t : tensors) { result.emplace_back(at::max(t)); } diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 56b7a6f98e779..f0dce20a6eff4 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -22,7 +22,7 @@ namespace { // Check if tensor list has either a boolean tensor or a integer tensor inline bool has_integral_tensor(TensorList tensors, const bool includeBool) { return std::any_of( - tensors.begin(), tensors.end(), [&includeBool](const auto& t) { + tensors.begin(), tensors.end(), [includeBool](const auto& t) { return at::isIntegralType(t.scalar_type(), includeBool); }); } @@ -53,8 +53,8 @@ inline void check_foreach_api_restrictions( inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2) { - TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); + check_foreach_api_restrictions(tensors1); + check_foreach_api_restrictions(tensors2); TORCH_CHECK( tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", @@ -67,21 +67,8 @@ inline void check_foreach_api_restrictions( TensorList tensors1, TensorList tensors2, TensorList tensors3) { - TORCH_CHECK(!tensors1.empty(), 
"Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor."); - TORCH_CHECK( - tensors1.size() == tensors2.size(), - "Tensor lists must have the same number of tensors, got ", - tensors1.size(), - " and ", - tensors2.size()); - TORCH_CHECK( - tensors1.size() == tensors3.size(), - "Tensor lists must have the same number of tensors, got ", - tensors1.size(), - " and ", - tensors3.size()); + check_foreach_api_restrictions(tensors1, tensors2); + check_foreach_api_restrictions(tensors1, tensors3); } inline void check_foreach_api_restrictions( @@ -90,12 +77,7 @@ inline void check_foreach_api_restrictions( TensorList tensors3, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2, tensors3); - TORCH_CHECK( - tensors1.size() == scalars.size(), - "Tensor list must have same number of elements as scalar list, got ", - tensors1.size(), - " and ", - scalars.size()); + check_foreach_api_restrictions(tensors1, scalars); } inline void check_foreach_api_restrictions( @@ -103,12 +85,7 @@ inline void check_foreach_api_restrictions( TensorList tensors2, ArrayRef scalars) { check_foreach_api_restrictions(tensors1, tensors2); - TORCH_CHECK( - tensors1.size() == scalars.size(), - "Tensor list must have same number of elements as scalar list, got ", - tensors1.size(), - " and ", - scalars.size()); + check_foreach_api_restrictions(tensors1, scalars); } // Helper function called in check_fast_path_restrictions to check whether all @@ -126,15 +103,13 @@ inline bool _check_tensors_share_device_and_dtype( tensor.is_non_overlapping_and_dense(); }; - for (const auto& tensorList : tensorLists) { - for (const auto& tensor : tensorList) { - if (!is_tensor_okay(tensor)) { - return false; - } - } - } - - return true; + return std::all_of( + tensorLists.cbegin(), + tensorLists.cend(), + [&](const TensorList& tensorList) { + return std::all_of( + tensorList.cbegin(), tensorList.cend(), is_tensor_okay); + }); } // Helper function called in check_fast_path_restrictions to check if @@ -180,11 +155,9 @@ inline bool _check_tensors_do_type_promotion_with_scalars( bool does_op_promote_integer_inputs_to_float = false) { for (const auto i : c10::irange(tensorList.size())) { // For division, integer inputs will result in float. - if (does_op_promote_integer_inputs_to_float) { - if (at::isIntegralType( - tensorList[i].scalar_type(), /*includeBool*/ true)) { - return false; - } + if (does_op_promote_integer_inputs_to_float && + at::isIntegralType(tensorList[i].scalar_type(), /*includeBool*/ true)) { + return false; } if (!scalarList.empty()) { const auto& scalar = @@ -361,36 +334,34 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( } }), "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding"); - if (!grouped_tensors_with_indices.count(key)) { - grouped_tensors_with_indices.insert( - {key, - TensorsAndIndicesT{ - [&]() -> nested_optional_tensorvec_t { - nested_optional_tensorvec_t nested_tensorvec; - nested_tensorvec.reserve(num_lists); - for (const auto& i : c10::irange(num_lists)) { - std::vector> tensors; - if (!nested_tensorlist[i].empty()) { - // NB: num_tensors is the max possible length for any of - // the inner lists of tensor references. Reserving the max - // trades memory for perf. This should not have significant - // impact. 
- tensors.reserve(num_tensors); - } - nested_tensorvec.emplace_back(tensors); - } - return nested_tensorvec; - }(), - [&]() -> IndicesT { - if (!with_indices) { - return {}; - } else { - IndicesT indices; - indices.reserve(num_tensors); - return indices; - } - }()}}); - } + grouped_tensors_with_indices.try_emplace( + key, + TensorsAndIndicesT{ + [&]() -> nested_optional_tensorvec_t { + nested_optional_tensorvec_t nested_tensorvec; + nested_tensorvec.reserve(num_lists); + for (const auto& i : c10::irange(num_lists)) { + std::vector> tensors; + if (!nested_tensorlist[i].empty()) { + // NB: num_tensors is the max possible length for any of + // the inner lists of tensor references. Reserving the max + // trades memory for perf. This should not have significant + // impact. + tensors.reserve(num_tensors); + } + nested_tensorvec.emplace_back(std::move(tensors)); + } + return nested_tensorvec; + }(), + [&]() -> IndicesT { + if (!with_indices) { + return {}; + } else { + IndicesT indices; + indices.reserve(num_tensors); + return indices; + } + }()}); for (const auto& list_index : c10::irange(num_lists)) { if (!nested_tensorlist[list_index].empty()) { grouped_tensors_with_indices[key].first[list_index].emplace_back( diff --git a/aten/src/ATen/native/GroupedMMUtils.h b/aten/src/ATen/native/GroupedMMUtils.h new file mode 100644 index 0000000000000..78993308cd5fa --- /dev/null +++ b/aten/src/ATen/native/GroupedMMUtils.h @@ -0,0 +1,167 @@ +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +inline bool check_valid_strides_and_return_transposed(const Tensor& mat) { + IntArrayRef tensor_strides = mat.strides(); + IntArrayRef tensor_sizes = mat.sizes(); + int end_dim = mat.dim() - 1; + int alignment = 16 / mat.element_size(); + TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); + if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { + TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); + return true; + } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { + TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); + return false; + } else { + TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); + } +} + +inline at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, +const Tensor& mat_b, +const std::optional& offs, +c10::ScalarType out_dtype +) { + c10::SmallVector out_size; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d) { + if (b_is_2d) { + out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; + } else { + TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(0), mat_b.size(-1)}; + } + } else { + if (b_is_2d) { + // this case is not actually encountered for MoE gemms + TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(1), mat_b.size(1)}; + } else { // regular bmm + TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); + out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; + } + } + + #ifndef USE_ROCM + // For TMA transfers, strides of output tensor have to be 
either + // 1, or aligned to 16 bytes. + const auto last_dim = out_size.size() - 1; + const auto alignment = 16 / c10::elementSize(out_dtype); + const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; + std::vector out_stride; + if (a_is_2d != b_is_2d) { + out_stride = {size_padded, 1}; + } else { + out_stride = {out_size[1] * size_padded, size_padded, 1}; + } + return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype)); + #else + return at::empty(out_size, mat_a.options().dtype(out_dtype)); + #endif +} + +inline void _grouped_mm_validate_inputs(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + TORCH_CHECK((mat_a.dtype() == at::kBFloat16) || (mat_a.dtype() == at::kFloat) || (mat_a.dtype() == at::kHalf), "Expected mat_a to be Float32, BFloat16 or Float16 matrix, got ", mat_a.scalar_type()); + TORCH_CHECK((mat_b.dtype() == at::kBFloat16) || (mat_b.dtype() == at::kFloat) || (mat_b.dtype() == at::kHalf), "Expected mat_b to be Float32, BFloat16 or Float16 matrix, got ", mat_b.scalar_type()); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (!a_is_2d || !b_is_2d) { + TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + + // check that the strides are valid, the fn will throw an error if not + check_valid_strides_and_return_transposed(mat_a); + check_valid_strides_and_return_transposed(mat_b); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); +} + +inline c10::ScalarType _resolve_grouped_mm_out_dtype(const Tensor& mat_a, const Tensor& mat_b, +std::optional out_dtype) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + // TODO(future PR): enable float32 output dtype for bfloat16 and float16 inputs + TORCH_CHECK(out_dtype_ == mat_a.dtype(), "Grouped gemm output dtype must match `mat_a` dtype"); + return out_dtype_; +} + + +inline void _grouped_mm_fallback(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype, +Tensor out) { + LOG(INFO) << "fallback path for `torch._grouped_mm`, performance may not be optimal"; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d && !b_is_2d) { + // 2d x 3d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(0, group_start_idx, group_end_idx); + auto out_slice = out.slice(0, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a_slice, mat_b[group_idx]); + group_start_idx = group_end_idx; + } + + } else if (!a_is_2d && b_is_2d) { + // 3d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_b_slice = 
mat_b.slice(1, group_start_idx, group_end_idx); + auto out_slice = out.slice(1, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a[group_idx], mat_b_slice); + group_start_idx = group_end_idx; + } + + } else if (a_is_2d && b_is_2d) { + // 2d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(1, group_start_idx, group_end_idx); + auto mat_b_slice = mat_b.slice(0, group_start_idx, group_end_idx); + auto out_slice = out[group_idx]; + at::mm_out(out_slice, mat_a_slice, mat_b_slice); + group_start_idx = group_end_idx; + } + + } else { + // 3d x 3d without offsets - regular bmm + at::bmm_out(out, mat_a, mat_b); + } +} + + +} // namespace at::native diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 5d3a84ea39f6d..a744da3bcad2e 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -185,6 +185,17 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra // right: "lro, summed, ro" permuted with rpermutation and the three flattened // then the permuted output is a view of bmm(left, right) // finally, opermutation reverts the permutation to the original order of dimensions + // By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions. + // However, if all dimensions from the right operand appear before those from the left + // operand in the final output, we can swap the operands so that bmm directly produces + // the result in the correct memory order. + + bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front(); + if (swap_lo_ro) { + std::swap(left, right); + std::swap(lo, ro); + std::swap(lo_size, ro_size); + } auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size(); std::vector out_size; out_size.reserve(out_num_dim); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2d7c2ff067c69..b62c584641dba 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,6 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif +#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1393,6 +1394,7 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { const int64_t min_size = get_mkldnn_matmul_min_size(); return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } +#endif static void addmm_impl_cpu_( @@ -1771,6 +1773,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; +#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1781,6 +1784,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens at::globalContext().setUserEnabledMkldnn(false); } } +#endif if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 53d56622fe628..ca86292403fbf 100644 --- 
a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -47,10 +47,14 @@ TORCH_META_FUNC(nll_loss_forward) TORCH_CHECK( target.dim() <= 1, "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; + if (self.dim() == 1 && target.dim() == 1) { + TORCH_CHECK_VALUE( + target.size(0) == 1, + "For 1D input, 1D target must have size 1, but got target size: ", + target.size(0)); + } TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), + self.dim() == 1 || (self.size(0) == target.size(0)), "size mismatch (got input: ", self.sizes(), ", target: ", diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 710a6498d3963..ac1086c6b6bd3 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -537,10 +537,13 @@ BatchNormBackend _select_batch_norm_backend( } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen - // See #64427 + // See https://github.com/pytorch/pytorch/issues/64427. // non static variable is used to be able to change environment variable in runtime for testing - // enabled by default for ROCm >= 7.0.0 - bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); + // enabled by default for ROCm >= 7.0.0 with miopen 3.5 + int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? detail::getCUDAHooks().versionMIOpen() : 0; + bool is_miopen_3_4 = miopen_version >= 30400; // ROCm 6.4 + bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 + bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(is_miopen_3_5); if ( detail::getCUDAHooks().compiledWithMIOpen() @@ -549,17 +552,15 @@ BatchNormBackend _select_batch_norm_backend( && input.dim() <= MIOPEN_DIM_MAX && input.dim() >= 3 && input.scalar_type() != at::kDouble - && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) + && (is_miopen_3_4 || input.scalar_type() != at::kBFloat16) && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input && weight.defined() && bias.defined() && ((running_mean.defined() && running_var.defined()) || (!running_mean.defined() && !running_var.defined() && training)) && (input.suggest_memory_format() == MemoryFormat::Contiguous -#if (defined(USE_ROCM) && ROCM_VERSION >= 60500) - || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) - || (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) -#endif - ) + || (is_miopen_3_5 && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM && + (input.suggest_memory_format() == MemoryFormat::ChannelsLast + || input.suggest_memory_format() == MemoryFormat::ChannelsLast3d))) ) { return BatchNormBackend::Miopen; } diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 2ac513bf08880..8833bdb6e471d 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -24,8 +25,13 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } - at::Tensor index = at::arange(num_classes, self.options()); - return 
at::eq(self.unsqueeze(-1), index).to(kLong); + { + // If `self` is a DTensor, then allow implicit replication + // of the `index` Tensor. + at::DTensorAllowImplicitReplication guard; + at::Tensor index = at::arange(num_classes, self.options()); + return at::eq(self.unsqueeze(-1), index).to(kLong); + } } auto shape = self.sizes().vec(); diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8072d24a1090d..8099648d37b29 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -240,8 +240,15 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod default: {} } } - C10_THROW_ERROR(NotImplementedError, - "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); + + std::ostringstream error_msg; + error_msg << "Padding size " << pad.size() << " is not supported for " << input_dim << "D input tensor.\n"; + error_msg << "Supported combinations for non-constant padding:\n"; + error_msg << " - 2D or 3D input: padding size = 2 (pads last dimension)\n"; + error_msg << " - 3D or 4D input: padding size = 4 (pads last 2 dimensions)\n"; + error_msg << " - 4D or 5D input: padding size = 6 (pads last 3 dimensions)"; + + C10_THROW_ERROR(NotImplementedError, error_msg.str()); } Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, std::string_view mode, std::optional value) { diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index f4fdd395f013a..746d8c1a2db4f 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -411,7 +411,8 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, - const std::optional& bias) { + const std::optional& bias, + at::Tensor& output) { TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " "and will be removed in a future PyTorch release.") @@ -436,9 +437,11 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) const int64_t M = size_to_dim_(input.dim() - 1, input.sizes()); const int64_t N = packed_weight_fp16.numCols(); + std::vector output_size = input.sizes().vec(); output_size.back() = N; - Tensor output = at::empty(output_size, input.options().dtype(at::kFloat)); + // Resize output Tensor + output.resize_(output_size); // Call the fp16 gemm interface fbgemm::cblas_gemm_compute( @@ -460,6 +463,14 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( return output; } +Tensor fbgemm_linear_fp16_weight_fp32_activation( + const Tensor& input, + const Tensor& packed_weight, + const std::optional& bias) { + at::Tensor output = at::empty({0}, input.options().dtype(at::kFloat)); + return at::native::fbgemm_linear_fp16_weight_fp32_activation(input, packed_weight, bias, output); + } + Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, @@ -468,6 +479,15 @@ Tensor fbgemm_linear_fp16_weight( input, packed_weight, bias); } +Tensor fbgemm_linear_fp16_weight( + const Tensor& input, + const Tensor& packed_weight, + const Tensor& bias, + at::Tensor& output) { + return at::native::fbgemm_linear_fp16_weight_fp32_activation( + input, packed_weight, bias, output); +} + #else // USE_FBGEMM Tensor fbgemm_linear_int8_weight_fp32_activation( @@ -554,6 +574,21 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { false, "This PyTorch 
installation was not built with FBGEMM operators"); } +Tensor fbgemm_linear_fp16_weight_fp32_activation( + const Tensor& input, + const Tensor& packed_weight, + const std::optional& bias, + at::Tensor& output) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + + // We make a strong guarantee that models using these operators will have the + // same numerics across different machines. Therefore, we do not provide a + // fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +} + Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, @@ -568,6 +603,21 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( false, "This PyTorch installation was not built with FBGEMM operators"); } +Tensor fbgemm_linear_fp16_weight( + const Tensor& input, + const Tensor& packed_weight, + const Tensor& bias, + at::Tensor& output) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight is deprecated " + "and will be removed in a future PyTorch release.") + + // We make a strong guarantee that models using these operators will have the + // same numerics across different machines. Therefore, we do not provide a + // fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +} + Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 5f9d5c85750b1..db046428bb683 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -220,6 +220,8 @@ static void check_argmax_argmin( const char* name, const Tensor& self, const std::optional& dim) { + TORCH_CHECK(!self.is_complex(), name, ": does not support complex input"); + TORCH_CHECK(!(self.scalar_type() == kBool), name, ": does not support bool input"); if (dim.has_value()) { auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); native::zero_numel_check_dims(self, dim_, name); diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 1bdc806a3b4ec..44215a26018f0 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -59,6 +59,8 @@ TORCH_META_FUNC(topk) "selected index k out of range"); int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim); TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); + TORCH_CHECK(!self.is_complex(), " topk does not support complex dtypes on CPU"); + TORCH_CHECK(!(self.scalar_type() == kBool), "topk does not support bool dtypes on CPU"); // Build the output size, which is the dim being selected set to // size k @@ -74,11 +76,7 @@ TORCH_META_FUNC2(sort, stable) (const Tensor& self, std::optional stable, int64_t dim, bool descending) { maybe_wrap_dim(dim, self.dim()); - const auto self_dtype = self.dtype(); - TORCH_CHECK_VALUE( - self_dtype != ScalarType::ComplexFloat && - self_dtype != ScalarType::ComplexDouble, - "Sort currently does not support complex dtypes on CPU."); + TORCH_CHECK(!self.is_complex(), " Sort does not support complex dtypes on CPU"); // See issue: https://github.com/pytorch/pytorch/issues/65863 // Strides should be dense, so as not to allocate too much memory. 
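For context on the dtype guards added in ReduceOps.cpp and Sorting.cpp above: complex and bool inputs to topk/sort/argmax now fail fast via TORCH_CHECK on CPU instead of reaching the kernels. A minimal standalone sketch of the resulting user-visible behavior (illustrative only; the tensors, values, and printing below are not part of this patch):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  // topk on a complex CPU tensor is rejected up front by the new TORCH_CHECK.
  at::Tensor c = at::randn({4}, at::kComplexFloat);
  try {
    at::topk(c, 2);
  } catch (const c10::Error& e) {
    std::cout << "topk: " << e.what() << "\n";
  }
  // argmax on a bool tensor is likewise rejected by check_argmax_argmin.
  try {
    at::argmax(at::zeros({3}, at::kBool));
  } catch (const c10::Error& e) {
    std::cout << "argmax: " << e.what() << "\n";
  }
  return 0;
}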
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 054cc66cf8eb3..1886e65fc1edc 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1640,6 +1640,9 @@ Tensor zeros_symint( std::optional layout, std::optional device, std::optional pin_memory) { + for (const auto& dim_size : size) { + TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative."); + } Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return zeros_sparse_compressed_symint( diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 77acfe47363e4..4fa0556ad7859 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,12 @@ c10::SymInt sym_size(const Tensor& self, int64_t dim) { return self.sym_size(dim); } +c10::SymBool sym_is_contiguous( + const Tensor& self, + c10::MemoryFormat memory_format) { + return self.sym_is_contiguous(memory_format); +} + c10::SymInt sym_stride(const Tensor& self, int64_t dim) { return self.sym_stride(dim); } diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 2d300177a0533..a1a7059b7d64f 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -139,7 +139,7 @@ struct Dist { static inline data_t map(const data_t& diff, const data_t& p) { return diff; } static inline data_t red(const data_t& agg, const data_t& up) { return max(agg, up); } static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } - // TODO This backward pass uses a very complext expression to compute (diff + // TODO This backward pass uses a very complex expression to compute (diff // == dist) that could be much faster if using SSE instructions. static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff) * (Vec(1) - vec::minimum(Vec(1), (diff.abs() - Vec(dist)).abs().ceil())); } }; diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 5715fd8f047f2..83b51a9985637 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t using result_type = typename traits::result_type; for (; i < n; i++) { result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); - *out_ptr = c10::guts::apply(op, dereference( + *out_ptr = std::apply(op, dereference( &data[1], &strides[1], i)); @@ -102,7 +102,7 @@ inline void execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { using traits = function_traits; for (; i < n; i++) { - c10::guts::apply(op, dereference( + std::apply(op, dereference( &data[0], &strides[0], i)); @@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[], } // Loop operation for `cpu_kernel_multiple_outputs`. -// 1. Use `c10::guts::apply` to make dynamic method invocation +// 1. Use `std::apply` to make dynamic method invocation // for the lambda passed in `cpu_kernel_multiple_outputs`. // 2. Iterate over the members of the returned tuple, set the corresponding // output tensor by the tuple member in `handle_tuple_outputs` function. 
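The Loops.h changes above swap c10::guts::apply for std::apply now that C++17 is available; both unpack a tuple of dereferenced operands into the kernel lambda. A minimal sketch of that call pattern, with illustrative names only (not the actual cpu_kernel machinery):

#include <iostream>
#include <tuple>
#include <utility>

int main() {
  // Stand-in for the element-wise lambda passed to cpu_kernel.
  auto op = [](float a, float b) { return a + b; };
  // Stand-in for the tuple produced by dereference() for one element.
  std::tuple<float, float> args{1.5f, 2.5f};
  // std::apply expands the tuple into the lambda's argument list.
  float out = std::apply(op, std::move(args));
  std::cout << out << "\n";  // prints 4
  return 0;
}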
@@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ } for (; i < n; i++) { - auto output = c10::guts::apply(op, dereference( + auto output = std::apply(op, dereference( &data[num_outputs], &strides[num_outputs], i)); @@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { auto args1 = dereference_vec(&data[1], opt_scalar, S, i); auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); - auto out1 = c10::guts::apply(vop, std::move(args1)); - auto out2 = c10::guts::apply(vop, std::move(args2)); + auto out1 = std::apply(vop, std::move(args1)); + auto out2 = std::apply(vop, std::move(args2)); out1.store(data[0] + i * sizeof(scalar_t)); out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); } diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index e3f08194bb58e..59d838b9782da 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -156,7 +156,7 @@ void cpu_padding( int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0; int64_t offset_w = p.offsets[ndim - 1]; - // do vectorized copy whe output is overlapped with input on W, + // do vectorized copy when output is overlapped with input on W, // only applies to positive padding auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 317647123d4c0..dac0f3bef25ee 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -647,10 +648,10 @@ _vec_softmax( parallel_for( 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; - std::unique_ptr temp_vec_input(new float[dim_size*vectorized_step]()); - std::unique_ptr temp_vec_output(new float[dim_size*vectorized_step]()); - float* temp_vec_input_data = temp_vec_input.get(); - float* temp_vec_output_data = temp_vec_output.get(); + std::vector temp_vec_input(dim_size * vectorized_step); + std::vector temp_vec_output(dim_size * vectorized_step); + float* temp_vec_input_data = temp_vec_input.data(); + float* temp_vec_output_data = temp_vec_output.data(); while (idx < end) { int64_t outer_idx = idx / inner_size; int64_t inner_idx = idx % inner_size; diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 5a288193143d4..d013dfa0485e0 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl( // // The optimal THRESHOLD to tile was found empirically. // When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead. - // Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. + // When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. // // When num_threads == 1, always use Method 2 as there is no synchronization overhead. 
// diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 40d39b3c7b606..fcaae32e773f1 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1080,6 +1081,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals #endif } +static bool _grouped_mm_allowed_device() { +#ifdef USE_ROCM + return false; +#else + auto dprops = at::cuda::getCurrentDeviceProperties(); + // CUDA capability 8.0 and greater + return dprops->major >= 8; +#endif +} + #ifdef USE_ROCM static bool _scaled_mm_is_fnuz() { return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); @@ -1289,21 +1300,30 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above"); } if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) { - TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above"); + TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e5m2 is only supported for ROCm 6.5 and above"); } if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) { - TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above"); + TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e4m3fn is only supported for ROCm 6.5 and above"); } #endif if (bias) { - TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32"); - TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half, - "Bias must be either Half or BFloat16, but got ", bias->scalar_type()); - TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) || - bias->scalar_type() == ScalarType::BFloat16, - "Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); - TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half, - "Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type()); + TORCH_CHECK(out.scalar_type() != kFloat, + "Bias is not supported when out_dtype is set to Float32"); + + TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || + bias->scalar_type() == ScalarType::Half, + "Bias must be BFloat16 or Half, but got ", bias->scalar_type()); + + TORCH_CHECK((out.scalar_type() != kFloat && + out.scalar_type() != ScalarType::BFloat16) || + bias->scalar_type() == ScalarType::BFloat16, + "Bias must be BFloat16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); + + TORCH_CHECK(out.scalar_type() != ScalarType::Half || + bias->scalar_type() == ScalarType::Half, + "Bias must be Float16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); } { auto bias_ = bias.value_or(Tensor()); @@ -1339,7 +1359,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // We are doing row-wise scaling auto dprops = at::cuda::getCurrentDeviceProperties(); if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise - && (dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)) { + && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) + // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales + || (dprops->major >= 
10 && (scale_a.sizes().size() || scale_b.sizes().size())))) { TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); at::cuda::detail::f8f8bf16_rowwise( mat1, @@ -1365,6 +1387,22 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); } + else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) { + #if ROCM_VERSION >= 70000 + TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && + mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif + } #endif cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b); @@ -1442,12 +1480,14 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, params.k = args.k; params.a = args.mata->data_ptr(); params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = args.scale_mata_dtype.value(); params.lda = args.lda; params.a_dtype = args.mata->scalar_type(); params.a_scale_dtype = args.scale_mata_dtype.value(); params.a_scaling_type = args.scaling_mata_type.value(); params.b = args.matb->data_ptr(); params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = args.scale_matb_dtype.value(); params.ldb = args.ldb; params.b_dtype = args.matb->scalar_type(); params.b_scale_dtype = args.scale_matb_dtype.value(); @@ -1512,71 +1552,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, } namespace { - at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, - const Tensor& mat_b, - const std::optional& offs, - std::optional out_dtype - ) { - c10::SmallVector out_size; - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (a_is_2d) { - if (b_is_2d) { - out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; - } else { - TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(0), mat_b.size(-1)}; - } - } else { - if (b_is_2d) { - // this case is not actually encountered for MoE gemms - TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(1), mat_b.size(1)}; - } else { // regular bmm - TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); - out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; - } - } - - const auto out_dtype_ = out_dtype.value_or(kBFloat16); - TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); - - #ifndef USE_ROCM - // For TMA transfers, strides of output tensor have to be either - // 1, or aligned to 16 bytes. 
- const auto last_dim = out_size.size() - 1; - const auto alignment = 16 / c10::elementSize(out_dtype_); - const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; - std::vector out_stride; - if (a_is_2d != b_is_2d) { - out_stride = {size_padded, 1}; - } else { - out_stride = {out_size[1] * size_padded, size_padded, 1}; - } - return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_)); - #else - return at::empty(out_size, mat_a.options().dtype(out_dtype_)); - #endif - } - - bool check_valid_strides_and_return_transposed(const Tensor& mat) { - IntArrayRef tensor_strides = mat.strides(); - IntArrayRef tensor_sizes = mat.sizes(); - int end_dim = mat.dim() - 1; - int alignment = 16 / mat.element_size(); - TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); - if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { - TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); - return true; - } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { - TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); - return false; - } else { - TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); - } - } - - void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + // Checks scales for 2d or 3d target tensors (`mat`). if (mat.dim() == 2) { TORCH_CHECK( scale.dim() == 1, @@ -1610,9 +1587,66 @@ namespace { "scale must have the same first dimension as mat for arg ", arg_idx); } -} + } + void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) { + // Checks scales for 2d or 3d target tensors (`mat`). + if (mat.dim() == 2) { + // For MXFP8, 2d tensors have variable size groups represented as subtensors, + // that are converted to blocked padded format individually, + // so we can't check the scale sizes without doing a d2h sync to get the group sizes here. + TORCH_CHECK( + scale.dim() == mat.dim(), + "for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx); + + // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4)) + // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4)) + // * weight is transposed prior to the call, scale stays non-transposed. + bool LHS = arg_idx == 0; + int scale_dim_to_check = 0; + int mat_dim_to_check = LHS ? 0 : 1; + TORCH_CHECK( + scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check), + "for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ", + "must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")"); + } else { + // For MXFP8, 3d tensors have static group sizes (stack of 2d tensors), + // so we can check the exact expected scale sizes here without a d2h sync. 
+ auto round_up = [](auto x, auto y) { + return ((x + y - 1) / y) * y; + }; + + // TODO: this is for 3d tensor in 2d-3d case specifically. + // We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them. + int64_t G = mat.size(0); + int64_t K = mat.size(1); + int64_t N = mat.size(2); + int64_t blocked_scale_K = round_up(K/32, 4); + int64_t blocked_scale_N = round_up(N, 128); + + // fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N). + TORCH_CHECK( + scale.dim() == mat.dim() - 1, + "for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx + ); + TORCH_CHECK( + scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N, + "for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx + ); + } + } + void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + bool using_fp8_rowwise = scale.scalar_type() == kFloat; + bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu; + if (using_fp8_rowwise) { + _check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier); + } else if (using_mxfp8) { + _check_scales_mxfp8(mat, scale, dim, arg_idx); + } else { + TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype()); + } + } } Tensor @@ -1637,8 +1671,8 @@ const std::optional& bias, const std::optional& scale_result, std::optional out_dtype, bool use_fast_accum) { - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+"); + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); @@ -1671,16 +1705,47 @@ bool use_fast_accum) { TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); } - // Both Per-Tensor and Row-wise scaling expect fp32 tensors + // FP8 per-tensor and per-row scaling expect fp32 scales. + // MXFP8 expects float8_e8m0fnu scales. TORCH_CHECK( - scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, - "Both scale_a and scale_b must be float (fp32) tensors."); + (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || + (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), + "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); check_scale(mat_b, scale_b, 1, 1, scale_multiplier); - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) + // MXFP8 grouped GEMM dispatching + bool is_mx8mx8bf16 = ( + mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && + scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu + ); + TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + + if (is_mx8mx8bf16) { + bool b_is_3d = mat_b.dim() == 3; + bool is_2d_2d = a_is_2d && b_is_2d; + bool is_2d_3d = a_is_2d && b_is_3d; + TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); + TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); + + fbgemm_gpu::mx8mx8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs.value(), + out); + return out; + } +#endif #ifndef USE_ROCM TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); @@ -1713,6 +1778,7 @@ bool use_fast_accum) { #else TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") #endif + #endif } @@ -1722,33 +1788,21 @@ const std::optional& offs, const std::optional& bias, std::optional out_dtype) { #ifndef USE_ROCM - bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); - TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0, 10.0"); - - TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type()); - TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type()); - TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); - TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (!a_is_2d || !b_is_2d) { - TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); - } - - // check that the strides are valid, the fn will throw an error if not - check_valid_strides_and_return_transposed(mat_a); - check_valid_strides_and_return_transposed(mat_b); - TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); - - if (offs.has_value()) { - TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); - TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + bool a_b_and_out_are_bf16 = ( + mat_a.dtype() == at::kBFloat16 && + mat_b.dtype() == at::kBFloat16 && + out_dtype.value_or(at::kBFloat16) == at::kBFloat16 + ); + bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, 
out_dtype_); + if (use_fast_path) { + // fast path, no d2h sync needed + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + } else { + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } - TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); - - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); - - at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); return out; #else TORCH_CHECK(false, "grouped gemm is not supported on ROCM") diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 16acbe0b8bf2d..12ad84a15b180 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -436,7 +436,6 @@ static inline void launch_vectorized_templated_kernel( loader_t l, storer_t s) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - using traits = function_traits; int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) / vectorized_templated_config::block_work_size(); auto stream = at::cuda::getCurrentCUDAStream(); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index b5908cc0abcfc..c6d3c25200d50 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -644,7 +644,12 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ Tensor grad = at::full_like(log_probs, neginf, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // initialization for log(sum (alpha beta)) // As above, there may be better configurations to use. - constexpr int max_threads = std::is_same_v ? 1024 : 896; // we need 72 or so 32 bit registers for double + constexpr int max_threads_ = std::is_same_v ? 1024 : 896; // we need 72 or so 32 bit registers for double + int max_threads = max_threads_; + // Blackwell launch bounds + if (at::cuda::getCurrentDeviceProperties()->major >= 10) { + max_threads = 512; + } int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; diff --git a/aten/src/ATen/native/cuda/Pow.cuh b/aten/src/ATen/native/cuda/Pow.cuh index dc9faf77f22a3..fe249c1cdaef3 100644 --- a/aten/src/ATen/native/cuda/Pow.cuh +++ b/aten/src/ATen/native/cuda/Pow.cuh @@ -14,7 +14,7 @@ namespace { // pow(double, int) // pow(float, float) // pow(double, double) -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(_LIBCPP_VERSION) // Functions for pow // pow for at::Half static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) { diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index 3acb359342f13..c6f88692a8a5c 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -20,7 +20,7 @@ // SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows // See https://github.com/pytorch/pytorch/issues/156181 -#if !defined(_WIN32) || CUDART_VERSION < 12090 +#if !(defined(_WIN32) && CUDART_VERSION == 12090) namespace at::native { @@ -606,4 +606,4 @@ REGISTER_DISPATCH( } // namespace at::native -#endif +#endif \ No newline at end of file diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu index 272eb9b9c564f..5444bb57eba7c 100644 --- a/aten/src/ATen/native/cuda/int4mm.cu +++ b/aten/src/ATen/native/cuda/int4mm.cu @@ -1304,7 +1304,7 @@ at::Tensor _convert_weight_to_int4pack_cuda( constexpr int32_t kKTileSize = 16; // GPT-FAST assumes nTileSize of 8 for 
quantized weight tensor. - // See https://github.com/pytorch-labs/gpt-fast/blob/091515ab5b06f91c0d6a3b92f9c27463f738cc9b/quantize.py#L510 + // See https://github.com/meta-pytorch/gpt-fast/blob/091515ab5b06f91c0d6a3b92f9c27463f738cc9b/quantize.py#L510 // Torch dynamo also requires the torch ops has the same output shape for each device. // See https://github.com/pytorch/pytorch/blob/ec284d3a74ec1863685febd53687d491fd99a161/torch/_meta_registrations.py#L3263 constexpr int32_t kNTileSizeTensor = 8; diff --git a/aten/src/ATen/native/cuda/int8mm.cu b/aten/src/ATen/native/cuda/int8mm.cu new file mode 100644 index 0000000000000..60f64cd9fc203 --- /dev/null +++ b/aten/src/ATen/native/cuda/int8mm.cu @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +namespace at::native { + +__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) { + // one thread per output element: [B, N] + int b = blockIdx.y * blockDim.y + threadIdx.y; + int n = blockIdx.x * blockDim.x + threadIdx.x; + + if (b >= B || n >= N) return; + + float acc = 0.0f; + for (int k = 0; k < K; ++k) { + acc += x[b * K + k] * static_cast(w[n * K + k]); + } + + out[b * N + n] = acc * scale[n]; +} + +void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) { + const int B = x.size(0); + const int K = x.size(1); + const int N = w_int8.size(0); + + const dim3 block(16, 16); + const dim3 grid((N + block.x - 1) / block.x, (B + block.y - 1) / block.y); + + auto stream = at::cuda::getCurrentCUDAStream(); + + weight_int8pack_mm_kernel<<>>( + x.data_ptr(), + w_int8.data_ptr(), + scale.data_ptr(), + out.data_ptr(), + B, K, N); +} + + +// Main GPU entry point +at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) { + // --- Check inputs --- + TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor"); + TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor"); + TORCH_CHECK(scale.is_cuda(), "scale must be a CUDA tensor"); + + TORCH_CHECK(x.dim() == 2, "x must be 2D"); + TORCH_CHECK(w_int8.dim() == 2, "w must be 2D"); + TORCH_CHECK(scale.dim() == 1, "scale must be 1D"); + + TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)"); + TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)"); + + // --- Determine shapes --- + auto B = x.size(0); // batch size + auto N = w_int8.size(0); // output dim + + // Ensure inputs are in the correct types for the kernel + auto x_f32 = x.to(at::kFloat); + auto w_int8_contiguous = w_int8.contiguous(); + auto scale_f32 = scale.to(at::kFloat); + + // --- Allocate output --- + auto out = at::empty({B, N}, x.options().dtype(at::kFloat)); + + // --- Launch kernel --- + launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out); + + return out; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 4d869e5679f8a..081b4afa15ac5 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -285,7 +285,7 @@ struct algorithm_search { sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution forward algorithms"); int perf_count; - std::unique_ptr perf_results(new perf_t[num_algos]); + c10::SmallVector perf_results; if (!benchmark) { AT_CUDNN_CHECK_WITH_SHAPES( cudnnGetConvolutionForwardAlgorithm_v7( @@ -296,7 +296,7 @@ struct 
algorithm_search { args.odesc.desc(), num_algos, &perf_count, - perf_results.get()), + perf_results.data()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); @@ -314,7 +314,7 @@ struct algorithm_search { args.output.data_ptr(), num_algos, &perf_count, - perf_results.get(), + perf_results.data(), ws.data, ws.size), args); @@ -324,7 +324,7 @@ struct algorithm_search { // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } - return getValidAlgorithms(perf_results.get(), args, perf_count); + return getValidAlgorithms(perf_results.data(), args, perf_count); } static void getWorkspaceSize( @@ -369,7 +369,8 @@ struct algorithm_search { sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward data algorithms."); int perf_count; - std::unique_ptr perf_results(new perf_t[num_algos]); + c10::SmallVector + perf_results; if (!benchmark) { AT_CUDNN_CHECK_WITH_SHAPES( cudnnGetConvolutionBackwardDataAlgorithm_v7( @@ -380,7 +381,7 @@ struct algorithm_search { args.idesc.desc(), num_algos, &perf_count, - perf_results.get()), + perf_results.data()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); @@ -398,7 +399,7 @@ struct algorithm_search { args.input.data_ptr(), num_algos, &perf_count, - perf_results.get(), + perf_results.data(), ws.data, ws.size), args); @@ -408,7 +409,7 @@ struct algorithm_search { // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } - return getValidAlgorithms(perf_results.get(), args, perf_count); + return getValidAlgorithms(perf_results.data(), args, perf_count); } static void getWorkspaceSize( @@ -456,7 +457,8 @@ struct algorithm_search { static_assert( sizeof(algos) / sizeof(algos[0]) == num_algos, "Missing cuDNN convolution backward filter algorithms."); - std::unique_ptr perf_results(new perf_t[num_algos]); + c10::SmallVector + perf_results; int perf_count; if (!benchmark) { AT_CUDNN_CHECK_WITH_SHAPES( @@ -468,7 +470,7 @@ struct algorithm_search { args.wdesc.desc(), num_algos, &perf_count, - perf_results.get()), + perf_results.data()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); @@ -486,7 +488,7 @@ struct algorithm_search { args.weight.data_ptr(), num_algos, &perf_count, - perf_results.get(), + perf_results.data(), ws.data, ws.size), args); @@ -496,7 +498,7 @@ struct algorithm_search { // memory, e.g. a few GBs. 
c10::cuda::CUDACachingAllocator::emptyCache(); } - return getValidAlgorithms(perf_results.get(), args, perf_count); + return getValidAlgorithms(perf_results.data(), args, perf_count); } static void getWorkspaceSize( diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp index 48119a6a3b4c3..c2f7ce2ac2d53 100644 --- a/aten/src/ATen/native/cudnn/MHA.cpp +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -2,9 +2,13 @@ #include #include -#if defined(USE_ROCM) || !AT_CUDNN_ENABLED() || \ - (defined(CUDNN_VERSION) && CUDNN_VERSION < 8900) +#if AT_CUDNN_ENABLED() +#include +#endif +#if defined(USE_ROCM) || !AT_CUDNN_ENABLED() || \ + (defined(CUDNN_VERSION) && CUDNN_VERSION < 8900) || \ + (defined(CUDNN_FRONTEND_VERSION) && CUDNN_FRONTEND_VERSION < 10100) namespace at { namespace native { @@ -84,6 +88,37 @@ void run_cudnn_SDP_bprop( false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); } +void run_cudnn_SDP_bprop_nestedtensor( + int64_t b, + int64_t h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + TORCH_CHECK( + false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); +} + } // namespace native } // namespace at @@ -95,7 +130,6 @@ void run_cudnn_SDP_bprop( #include #include -#include #include #include @@ -111,42 +145,58 @@ namespace native { #include namespace fe = cudnn_frontend; -using graph_and_tensors = std::tuple< - std::shared_ptr, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::optional>, // Bias - std::shared_ptr, // Attn_scale, - // TODO(eqy): additional options - // std::shared_ptr, // SEQ_LEN_Q, - // std::shared_ptr, // SEQ_LEN_KV, - std::shared_ptr, // Seed, - std::shared_ptr, // Offset, - // std::shared_ptr, // Dropout_mask, - // std::shared_ptr, // Dropout_scale - std::shared_ptr, // O - std::shared_ptr // Stats - >; - -using graph_and_tensors_backward = std::tuple< - std::shared_ptr, - std::shared_ptr, // Q, - std::shared_ptr, // K, - std::shared_ptr, // V, - std::optional>, // Bias, - std::shared_ptr, // Attn_scale, - std::shared_ptr, // Seed, - std::shared_ptr, // Offset, - std::shared_ptr, // O, - std::shared_ptr, // dO, - std::shared_ptr, // stats, - std::shared_ptr, // dQ, - std::shared_ptr, // dK,, - std::shared_ptr // dV, - >; - -#define MAX_MHA_DIM 4 + +constexpr uint8_t MAX_MHA_DIM = 4; + +// Whether we will use ragged offsets in the dense (non-nested) path +// to avoid recompilation +bool use_ragged_in_dense( + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + bool has_bias) { + static bool flag = + c10::utils::check_env("TORCH_CUDNN_SDPA_AVOID_RECOMPILE") == true; + if (!flag) { + return flag; + } + TORCH_WARN_ONCE( + "TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 is currently experimental. " + "Please report any issues to https://github.com/pytorch/pytorch/issues."); + if (has_bias) { + TORCH_WARN_ONCE( + "TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works without bias." + "Consider using the is_causal hint instead of bias for causal masking." 
+ "Falling back to regular dense case, which may trigger excessive recompilation."); + return !has_bias; + } + bool all_bshd = q.dim() == 4 && q.transpose(1, 2).is_contiguous() && + k.dim() == 4 && k.transpose(1, 2).is_contiguous() && v.dim() == 4 && + v.transpose(1, 2).is_contiguous() && o.dim() == 4 && + o.transpose(1, 2).is_contiguous(); + if (!all_bshd) { + TORCH_WARN_ONCE( + "TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout," + "e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)." + "Falling back to regualr dense case, which may trigger excessive recompilation."); + } + return all_bshd; +} + +int roundup_power2(int dim) { + if (!dim) { + return 1; + } + dim--; + dim |= dim >> 1; + dim |= dim >> 2; + dim |= dim >> 4; + dim |= dim >> 8; + dim |= dim >> 16; + dim++; + return dim; +} struct MHAParams { c10::DeviceIndex device_id; @@ -171,6 +221,7 @@ struct MHAParams { // might be redundant if we take 0 dim/stride // as signaling no-bias bool has_attn_bias; + bool use_ragged; }; void setMHAParams( @@ -187,7 +238,8 @@ void setMHAParams( const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { memset(¶ms, 0, sizeof(MHAParams)); params.device_id = at::cuda::current_device(); params.dataType = fe::DataType_t::HALF; @@ -204,23 +256,24 @@ void setMHAParams( params.is_causal = is_causal; params.return_softmaxstats = return_softmaxstats; params.has_attn_bias = attn_bias.has_value(); + // Expect 4D dense tensor, 3D nested case (THD) TORCH_INTERNAL_ASSERT( - q.sizes().size() == MAX_MHA_DIM, + q.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - q.strides().size() == MAX_MHA_DIM, + q.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.sizes().size() == MAX_MHA_DIM, + k.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.strides().size() == MAX_MHA_DIM, + k.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.sizes().size() == MAX_MHA_DIM, + v.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.strides().size() == MAX_MHA_DIM, + v.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); std::copy(q.sizes().begin(), q.sizes().end(), params.q_dim.begin()); std::copy(q.strides().begin(), q.strides().end(), params.q_stride.begin()); @@ -228,6 +281,20 @@ void setMHAParams( std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin()); std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin()); std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin()); + bool use_ragged = use_ragged_in_dense(q, k, v, q, params.has_attn_bias); + params.use_ragged = use_ragged; + if (use_ragged) { + // ignore B - stride in BSHD (THD) avoid-recompile + params.q_stride[0] = INT_MAX; + params.k_stride[0] = INT_MAX; + params.v_stride[0] = INT_MAX; + // fix seqlen to rounded value + 
params.s_q = roundup_power2(params.s_q); + params.s_kv = roundup_power2(params.s_kv); + params.q_dim[2] = roundup_power2(params.q_dim[2]); + params.k_dim[2] = roundup_power2(params.k_dim[2]); + params.v_dim[2] = roundup_power2(params.v_dim[2]); + } // uninit is OK as the struct is memset 0'd if (params.has_attn_bias) { std::copy( @@ -255,7 +322,8 @@ struct MHACacheKeyWrapper : ParamsWrapper { const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { setMHAParams( this->pod, b, @@ -270,22 +338,37 @@ struct MHACacheKeyWrapper : ParamsWrapper { attn_bias, dropout_probability, is_causal, - return_softmaxstats); + return_softmaxstats, + is_nested); } }; template struct MHAGraphCache { std::unordered_map> engine_cache; + int count = 0; + int hits = 0; // no mutexes here as caches are now thread local for v8, can also return a // pointer to the Execution Plan if we know it will not be invalidated by // another thread T* find(const KeyType& key) { + static bool flag = + c10::utils::check_env("TORCH_CUDNN_SDPA_CACHE_DEBUG") == true; + if (flag && count) { + TORCH_WARN( + "SDPA Cache Called ", + count, + " times. Hit rate: ", + 100 * hits / count, + "%"); + } + count++; auto it = engine_cache.find(key); if (it == engine_cache.end()) { return nullptr; } + hits++; return &(it->second); } @@ -298,11 +381,45 @@ struct MHAGraphCache { // @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to // be thread safe across all engines see Limitations in // https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html -thread_local MHAGraphCache mhagraphcache; -thread_local MHAGraphCache - mhagraphbackwardcache; +// We also leak the caches to workaround potential teardown race issues. + +auto& getMHAGraphCache_() { + thread_local auto& instance = + *new MHAGraphCache, MHACacheKeyWrapper>; + return instance; +} + +auto& getMHAGraphBackwardCache_() { + thread_local auto& instance = + *new MHAGraphCache, MHACacheKeyWrapper>; + return instance; +} namespace { + +enum UIDS { + Q, + K, + V, + O, + BIAS, + SCALE, + SEED, + OFFSET, + LSE, + DO, + DQ, + DK, + DV, + SEQ_LEN_Q, + SEQ_LEN_KV, + RAG_Q_OFF, + RAG_K_OFF, + RAG_V_OFF, + RAG_O_OFF, + RAG_LSE_OFF +}; + // analogous to the same function in Descriptors.h for cuDNN Convolutions... auto fixSizeOneDimStrideSDPA( const IntArrayRef sizes, @@ -320,9 +437,10 @@ auto fixSizeOneDimStrideSDPA( } return strides; } + } // namespace -auto build_graph_and_tensors( +auto build_graph( int64_t b, int64_t h, int64_t s_q, @@ -355,65 +473,162 @@ auto build_graph_and_tensors( .set_compute_data_type(fe::DataType_t::FLOAT); auto attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) .set_name("Attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); - auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seed") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type( - dropoutseed.dtype() == kInt - ? 
fe::DataType_t::INT32 - : fe::DataType_t::INT64)); - auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Offset") + auto scaled_dot_product_flash_attention_options = + fe::graph::SDPA_attributes() + .set_name("CUDNN_SDPA") + .set_generate_stats(return_softmaxstats) + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale); + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + scaled_dot_product_flash_attention_options.set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) + .set_padding_mask(true); + } + if (dropout_probability != 0.0f) { + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_data_type( - dropoutoffset.dtype() == kInt + dropoutseed.dtype() == kInt ? fe::DataType_t::INT32 : fe::DataType_t::INT64)); - auto scaled_dot_product_flash_attention_options = - fe::graph::SDPA_attributes() - .set_name("CUDNN_SDPA") - .set_is_inference(return_softmaxstats == false) - .set_causal_mask(is_causal) - .set_attn_scale(attn_scale) - .set_dropout(dropout_probability, seed, offset); - auto Q = mha_graph->tensor( - fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim(q.sizes().vec()) - .set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec()))); - auto K = mha_graph->tensor( - fe::graph::Tensor_attributes() - .set_name("K") - .set_dim(k.sizes().vec()) - .set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec()))); - auto V = mha_graph->tensor( - fe::graph::Tensor_attributes() - .set_name("V") - .set_dim(v.sizes().vec()) - .set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec()))); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + scaled_dot_product_flash_attention_options.set_dropout( + dropout_probability, seed, offset); + } + auto Q_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(Q).set_name("Q")); + auto K_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(K).set_name("K")); + auto V_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(V).set_name("V")); std::optional> bias; if (attn_bias.has_value()) { bias = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) .set_name("bias") .set_dim(attn_bias.value().sizes().vec()) .set_stride(attn_bias.value().strides().vec())); scaled_dot_product_flash_attention_options.set_bias(bias.value()); } - auto [O, Stats] = - mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options); - O->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec()); - + auto [O_, Stats] = + mha_graph->sdpa(Q_, K_, V_, scaled_dot_product_flash_attention_options); + O_->set_uid(O).set_output(true); if (Stats) { - Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT); + Stats->set_uid(LSE) + .set_output(true) + .set_data_type(fe::DataType_t::FLOAT) + .set_stride(softmaxstats.strides().vec()); + } + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + .set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_STATS_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + O_->set_ragged_offset(RAG_O_OFF_); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + auto qsizevec = q.sizes().vec(); + auto ksizevec = k.sizes().vec(); + auto vsizevec = v.sizes().vec(); + auto osizevec = o.sizes().vec(); + qsizevec[2] = roundup_power2(qsizevec[2]); + ksizevec[2] = roundup_power2(ksizevec[2]); + vsizevec[2] = roundup_power2(vsizevec[2]); + osizevec[2] = roundup_power2(osizevec[2]); + // we checked for BSHD contig., set fake strides as cuDNN will complain + // if e.g., a ragged dim is smaller than a non-ragged one: + // consider HBSD tensor where H is 1 + Q_->set_dim(qsizevec).set_stride( + {INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1}); + K_->set_dim(ksizevec).set_stride( + {INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1}); + V_->set_dim(vsizevec).set_stride( + {INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1}); + O_->set_dim(osizevec).set_stride( + {INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1}); + if (Stats) { + Stats->set_ragged_offset(RAG_STATS_OFF_); + auto statssizevec = softmaxstats.sizes().vec(); + statssizevec[2] = 
roundup_power2(statssizevec[2]); + Stats->set_dim(statssizevec); + } + } else { + Q_->set_dim(q.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())); + K_->set_dim(k.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())); + V_->set_dim(v.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())); + O_->set_dim(o.sizes().vec()) + .set_stride(fixSizeOneDimStrideSDPA(o.sizes(), o.strides().vec())); + if (Stats) { + Stats->set_dim(softmaxstats.sizes().vec()); + } } AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); @@ -423,20 +638,10 @@ auto build_graph_and_tensors( AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); - return std::make_tuple( - std::move(mha_graph), - std::move(Q), - std::move(K), - std::move(V), - std::move(bias), - std::move(attn_scale), - std::move(seed), - std::move(offset), - std::move(O), - std::move(Stats)); + return mha_graph; } -auto build_graph_and_tensors_nestedtensor( +auto build_graph_nestedtensor( int64_t b, int64_t h_q, int64_t h_k, @@ -473,28 +678,22 @@ auto build_graph_and_tensors_nestedtensor( .set_compute_data_type(fe::DataType_t::FLOAT); auto attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) .set_name("Attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_is_pass_by_value(true) .set_data_type(fe::DataType_t::FLOAT)); - auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seed") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Offset") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto SEQ_LEN_Q = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seq_q") - .set_dim({b, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto SEQ_LEN_KV = + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) .set_name("Seq_kv") .set_dim({b, 1, 1, 1}) .set_stride({1, 1, 1, 1}) @@ -503,44 +702,69 @@ auto build_graph_and_tensors_nestedtensor( auto scaled_dot_product_flash_attention_options = fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA_NESTEDTENSOR") - .set_is_inference(return_softmaxstats == false) + .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale) - .set_dropout(dropout_probability, seed, offset) - .set_seq_len_q(SEQ_LEN_Q) - .set_seq_len_kv(SEQ_LEN_KV) + .set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) .set_padding_mask(true); + if (dropout_probability != 0.0f) { + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutseed.dtype() == kInt + ? fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + scaled_dot_product_flash_attention_options.set_dropout( + dropout_probability, seed, offset); + } // We hardcode BSHD to cuDNN even though the underlying layout is THD auto q_strides = q.strides(); auto k_strides = k.strides(); auto v_strides = v.strides(); + // NB: cuDNN API shape is transposed: we pass it nominally as HTD constexpr int strideidx0 = 1; constexpr int strideidx1 = 0; constexpr int strideidx2 = 2; - auto Q = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim({b, h_q, s_q, d_qk}) - .set_stride( - {INT_MAX, - q_strides[strideidx0], - q_strides[strideidx1], - q_strides[strideidx2]})); - auto K = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim({b, h_k, s_kv, d_qk}) - .set_stride( - {INT_MAX, - k_strides[strideidx0], - k_strides[strideidx1], - k_strides[strideidx2]})); - auto V = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim({b, h_v, s_kv, d_v}) - .set_stride( - {INT_MAX, - v_strides[strideidx0], - v_strides[strideidx1], - v_strides[strideidx2]})); + auto Q_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(Q) + .set_name("Q") + .set_dim({b, h_q, s_q, d_qk}) + .set_stride( + {INT_MAX, + q_strides[strideidx0], + q_strides[strideidx1], + q_strides[strideidx2]})); + auto K_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(K) + .set_name("K") + .set_dim({b, h_k, s_kv, d_qk}) + .set_stride( + {INT_MAX, + k_strides[strideidx0], + k_strides[strideidx1], + k_strides[strideidx2]})); + auto V_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(V) + .set_name("V") + .set_dim({b, h_v, s_kv, d_v}) + .set_stride( + {INT_MAX, + v_strides[strideidx0], + v_strides[strideidx1], + v_strides[strideidx2]})); std::optional> bias; if (attn_bias.has_value()) { TORCH_CHECK( @@ -548,44 +772,48 @@ auto build_graph_and_tensors_nestedtensor( "attn_bias not yet supportd with cuDNN Attention and NestedTensor"); bias = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) .set_name("bias") .set_dim(attn_bias.value().sizes().vec()) .set_stride(attn_bias.value().strides().vec())); scaled_dot_product_flash_attention_options.set_bias(bias.value()); } - auto RAG_Q_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_q") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto RAG_K_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_k") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto RAG_V_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_v") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - auto RAG_O_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("cum_seq_o") - .set_dim({b + 1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(fe::DataType_t::INT32)); - // auto RAG_STATS_OFF = mha_graph->tensor(fe::graph::Tensor_attributes() - // .set_name("cum_seq_stats") - // .set_dim({b + 1, 1, 1, 1}) - // .set_stride({1, 1, 1, 1}) - // .set_data_type(fe::DataType_t::INT32)); - auto RAG_STATS_OFF = nullptr; - Q->set_ragged_offset(RAG_Q_OFF); - K->set_ragged_offset(RAG_K_OFF); - V->set_ragged_offset(RAG_V_OFF); - auto [O, Stats] = - mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options); + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + 
.set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + auto [O_, Stats] = + mha_graph->sdpa(Q_, K_, V_, scaled_dot_product_flash_attention_options); auto o_strides = o.strides(); - O->set_output(true) + O_->set_output(true) + .set_uid(O) .set_dim({b, h_q, s_q, d_v}) .set_stride( {INT_MAX, @@ -593,16 +821,20 @@ auto build_graph_and_tensors_nestedtensor( o_strides[strideidx1], o_strides[strideidx2]}); - O->set_ragged_offset(RAG_O_OFF); + O_->set_ragged_offset(RAG_O_OFF_); if (Stats) { - TORCH_CHECK( - false, - "cuDNN SDPA Nested Tensor does not yet handle backwards/logsumexp computation"); - // TODO(eqy): fix when stats (backward) support is added + auto RAG_STATS_OFF = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); Stats->set_output(true) + .set_uid(LSE) .set_data_type(fe::DataType_t::FLOAT) .set_dim({b, h_q, s_q, 1}) - .set_stride({h_q * s_q * d_v, d_v, s_q * d_v, 1}); + .set_stride({h_q * s_q, 1, h_q, 1}); Stats->set_ragged_offset(RAG_STATS_OFF); } AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); @@ -611,27 +843,10 @@ auto build_graph_and_tensors_nestedtensor( mha_graph->create_execution_plans({fe::HeurMode_t::A})); AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); - return std::make_tuple( - std::move(mha_graph), - std::move(Q), - std::move(K), - std::move(V), - std::move(bias), - std::move(attn_scale), - std::move(seed), - std::move(offset), - std::move(O), - std::move(Stats), - std::move(RAG_Q_OFF), - std::move(RAG_K_OFF), - std::move(RAG_V_OFF), - std::move(RAG_O_OFF), - std::move(RAG_STATS_OFF), - std::move(SEQ_LEN_Q), - std::move(SEQ_LEN_KV)); + return mha_graph; } -auto build_graph_and_tensors_backward( +auto build_graph_backward( int64_t b, int64_t h, int64_t s_q, @@ -667,6 +882,7 @@ auto build_graph_and_tensors_backward( .set_compute_data_type(fe::DataType_t::FLOAT); auto attn_scale = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) .set_name("Attn_scale") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) @@ -676,87 +892,415 @@ auto build_graph_and_tensors_backward( .set_name("CUDNN_SDPA_BACKWARD") .set_causal_mask(is_causal) .set_attn_scale(attn_scale); - auto Q = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Q") - .set_dim(q.sizes().vec()) - .set_stride(q.strides().vec())); - auto K = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("K") - .set_dim(k.sizes().vec()) - .set_stride(k.strides().vec())); - auto V = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("V") - .set_dim(v.sizes().vec()) 
- .set_stride(v.strides().vec())); + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + sdpa_backward_options.set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) + .set_padding_mask(true); + } + + auto Q_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(Q).set_name("Q")); + auto K_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(K).set_name("K")); + auto V_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(V).set_name("V")); std::optional> bias; if (attn_bias.has_value()) { bias = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) .set_name("bias") .set_dim(attn_bias.value().sizes().vec()) .set_stride(attn_bias.value().strides().vec())); sdpa_backward_options.set_bias(bias.value()); } - auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Seed") - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type( - dropoutseed.dtype() == kInt - ? fe::DataType_t::INT32 - : fe::DataType_t::INT64)); - - auto Offset = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("Offset") + if (dropout_probability != 0.0f) { + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") .set_dim({1, 1, 1, 1}) .set_stride({1, 1, 1, 1}) .set_data_type( - dropoutoffset.dtype() == kInt + dropoutseed.dtype() == kInt ? fe::DataType_t::INT32 : fe::DataType_t::INT64)); - - auto O = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("O") - .set_dim(o.sizes().vec()) - .set_stride(o.strides().vec())); - auto STATS = mha_graph->tensor(fe::graph::Tensor_attributes() + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + sdpa_backward_options.set_dropout(dropout_probability, seed, offset); + } + auto O_ = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(O).set_name("O")); + auto Stats = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(LSE) .set_name("Stats") - .set_dim(softmaxstats.sizes().vec()) .set_stride(softmaxstats.strides().vec()) .set_data_type(fe::DataType_t::FLOAT)); - auto DO = mha_graph->tensor(fe::graph::Tensor_attributes() - .set_name("DO") - .set_dim(dO.sizes().vec()) - .set_stride(dO.strides().vec())); + auto Do = mha_graph->tensor( + fe::graph::Tensor_attributes().set_uid(DO).set_name("DO")); + auto [Dq, Dk, Dv] = mha_graph->sdpa_backward( + Q_, K_, V_, O_, Do, Stats, sdpa_backward_options); + Dq->set_uid(DQ).set_output(true); + Dk->set_uid(DK).set_output(true); + Dv->set_uid(DV).set_output(true); + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + .set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_STATS_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + O_->set_ragged_offset(RAG_O_OFF_); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + Dq->set_ragged_offset(RAG_Q_OFF_); + Dk->set_ragged_offset(RAG_K_OFF_); + Dv->set_ragged_offset(RAG_V_OFF_); + Do->set_ragged_offset(RAG_O_OFF_); + auto qsizevec = q.sizes().vec(); + auto ksizevec = k.sizes().vec(); + auto vsizevec = v.sizes().vec(); + auto osizevec = o.sizes().vec(); + qsizevec[2] = roundup_power2(qsizevec[2]); + ksizevec[2] = roundup_power2(ksizevec[2]); + vsizevec[2] = roundup_power2(vsizevec[2]); + osizevec[2] = roundup_power2(osizevec[2]); + // see corresponding section in the forward about the hardcoding + // of strides here + Q_->set_dim(qsizevec).set_stride( + {INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1}); + K_->set_dim(ksizevec).set_stride( + {INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1}); + V_->set_dim(vsizevec).set_stride( + {INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1}); + O_->set_dim(osizevec).set_stride( + {INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1}); + // should be identical to their non-d counterparts + Dq->set_dim(qsizevec).set_stride( + {INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1}); + Dk->set_dim(ksizevec).set_stride( + {INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1}); + Dv->set_dim(vsizevec).set_stride( + {INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1}); + Do->set_dim(osizevec).set_stride( + {INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1}); + + 
Stats->set_ragged_offset(RAG_STATS_OFF_); + auto statssizevec = softmaxstats.sizes().vec(); + statssizevec[2] = roundup_power2(statssizevec[2]); + Stats->set_dim(statssizevec); + } else { + O_->set_dim(o.sizes().vec()).set_stride(o.strides().vec()); + Q_->set_dim(q.sizes().vec()).set_stride(q.strides().vec()); + K_->set_dim(k.sizes().vec()).set_stride(k.strides().vec()); + V_->set_dim(v.sizes().vec()).set_stride(v.strides().vec()); + Dq->set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec()); + Dk->set_dim(dK.sizes().vec()).set_stride(dK.strides().vec()); + Dv->set_dim(dV.sizes().vec()).set_stride(dV.strides().vec()); + Do->set_dim(dO.sizes().vec()).set_stride(dO.strides().vec()); + Stats->set_dim(softmaxstats.sizes().vec()); + } + + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); + AT_CUDNN_FRONTEND_CHECK( + mha_graph->create_execution_plans({fe::HeurMode_t::A})); + AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); + return mha_graph; +} + +auto build_graph_backward_nestedtensor( + int64_t b, + int64_t h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset, + cudnnHandle_t& handle) { + auto dtype = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + dtype = fe::DataType_t::BFLOAT16; + } + auto mha_graph = std::make_shared(); + // We're baking in float accumulation and scale types + // in theory the graph may support other types, but they + // have not been tested + mha_graph->set_io_data_type(dtype) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto attn_scale = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SCALE) + .set_name("Attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + + auto SEQ_LEN_Q_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_Q) + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto SEQ_LEN_KV_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEQ_LEN_KV) + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() + .set_name("CUDNN_SDPA_NESTEDTENSOR_BACKWARD") + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale) + .set_seq_len_q(SEQ_LEN_Q_) + .set_seq_len_kv(SEQ_LEN_KV_) + .set_padding_mask(true); if (dropout_probability != 0.0f) { - sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset); + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(SEED) + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutseed.dtype() == kInt + ? 
fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(OFFSET) + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type( + dropoutoffset.dtype() == kInt + ? fe::DataType_t::INT32 + : fe::DataType_t::INT64)); + sdpa_backward_options.set_dropout(dropout_probability, seed, offset); } - auto [DQ, DK, DV] = - mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options); - DQ->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec()); - DK->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec()); - DV->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec()); + auto q_strides = q.strides(); + auto k_strides = k.strides(); + auto v_strides = v.strides(); + // NB: cuDNN API shape is transposed + constexpr int strideidx0 = 1; + constexpr int strideidx1 = 0; + constexpr int strideidx2 = 2; + auto Q_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(Q) + .set_name("Q") + .set_dim({b, h_q, s_q, d_qk}) + .set_stride( + {INT_MAX, + q_strides[strideidx0], + q_strides[strideidx1], + q_strides[strideidx2]})); + auto K_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(K) + .set_name("K") + .set_dim({b, h_k, s_kv, d_qk}) + .set_stride( + {INT_MAX, + k_strides[strideidx0], + k_strides[strideidx1], + k_strides[strideidx2]})); + auto V_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(V) + .set_name("V") + .set_dim({b, h_v, s_kv, d_v}) + .set_stride( + {INT_MAX, + v_strides[strideidx0], + v_strides[strideidx1], + v_strides[strideidx2]})); + auto o_strides = o.strides(); + auto O_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(O) + .set_name("O") + .set_dim({b, h_q, s_q, d_v}) + .set_stride( + {INT_MAX, + o_strides[strideidx0], + o_strides[strideidx1], + o_strides[strideidx2]})); + + std::optional> bias; + if (attn_bias.has_value()) { + TORCH_CHECK( + false, + "attn_bias not yet supportd with cuDNN Attention and NestedTensor"); + bias = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(BIAS) + .set_name("bias") + .set_dim(attn_bias.value().sizes().vec()) + .set_stride(attn_bias.value().strides().vec())); + sdpa_backward_options.set_bias(bias.value()); + } + auto RAG_Q_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_Q_OFF) + .set_name("cum_seq_q") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_K_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_K_OFF) + .set_name("cum_seq_k") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_V_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_V_OFF) + .set_name("cum_seq_v") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_O_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_O_OFF) + .set_name("cum_seq_o") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto RAG_STATS_OFF_ = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(RAG_LSE_OFF) + .set_name("cum_seq_stats") + .set_dim({b + 1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + O_->set_ragged_offset(RAG_O_OFF_); + Q_->set_ragged_offset(RAG_Q_OFF_); + K_->set_ragged_offset(RAG_K_OFF_); + V_->set_ragged_offset(RAG_V_OFF_); + auto 
STATS = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_uid(LSE) + .set_name("stats") + .set_dim({b, h_q, s_q, 1}) + .set_stride({s_q * h_q, 1, h_q, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + STATS->set_ragged_offset(RAG_STATS_OFF_); + auto do_strides = dO.strides(); + auto DO_ = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_ragged_offset(RAG_O_OFF_) + .set_uid(DO) + .set_name("DO") + .set_dim({b, h_q, s_q, d_v}) + .set_stride( + {INT_MAX, + do_strides[strideidx0], + do_strides[strideidx1], + do_strides[strideidx2]})); + auto [Dq, Dk, Dv] = mha_graph->sdpa_backward( + Q_, K_, V_, O_, DO_, STATS, sdpa_backward_options); + Dq->set_output(true) + .set_uid(DQ) + .set_ragged_offset(RAG_Q_OFF_) + .set_dim({b, h_q, s_q, d_qk}) + .set_stride( + {INT_MAX, + q_strides[strideidx0], + q_strides[strideidx1], + q_strides[strideidx2]}); + Dk->set_output(true) + .set_uid(DK) + .set_ragged_offset(RAG_K_OFF_) + .set_dim({b, h_k, s_kv, d_qk}) + .set_stride( + {INT_MAX, + k_strides[strideidx0], + k_strides[strideidx1], + k_strides[strideidx2]}); + Dv->set_output(true) + .set_uid(DV) + .set_ragged_offset(RAG_V_OFF_) + .set_dim({b, h_v, s_kv, d_v}) + .set_stride( + {INT_MAX, + v_strides[strideidx0], + v_strides[strideidx1], + v_strides[strideidx2]}); + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); AT_CUDNN_FRONTEND_CHECK( mha_graph->create_execution_plans({fe::HeurMode_t::A})); AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); - return std::make_tuple( - std::move(mha_graph), - std::move(Q), - std::move(K), - std::move(V), - std::move(bias), - std::move(attn_scale), - std::move(Seed), - std::move(Offset), - std::move(O), - std::move(DO), - std::move(STATS), - std::move(DQ), - std::move(DK), - std::move(DV)); + return mha_graph; } void run_cudnn_SDP_fprop( @@ -778,31 +1322,61 @@ void run_cudnn_SDP_fprop( Tensor& o, Tensor& dropoutseed, Tensor& dropoutoffset) { - const auto dprops = at::cuda::getCurrentDeviceProperties(); - auto _dropoutseed = dropoutseed; - auto _dropoutoffset = dropoutoffset; - // cuDNN dropout bug requires these to be in int64 - if (dprops->major == 10 && dprops->minor == 0) { - _dropoutseed = dropoutseed.to(kLong); - _dropoutoffset = dropoutoffset.to(kLong); + // do nothing if we got 0-element tensors + if (!q.numel() || !k.numel() || !v.numel()) { + return; } + Tensor seqlen_q, seqlen_kv; + Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse; - cudnnHandle_t handle = getCudnnHandle(); if (!o.defined()) { // q is passed to us in BHSD dim order alloc_with_matching_layout(q, o, {b, h, s_q, d_v}); } - + bool use_ragged = use_ragged_in_dense(q, k, v, o, attn_bias.has_value()); if (return_softmaxstats && !softmaxstats.defined()) { - // TODO(eqy): verify that this is correct - softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat)); + // TODO(eqy): investigate why cuDNN doesn't like BSH layout softmaxstats + if (!use_ragged) { + softmaxstats = at::empty({b, h, s_q, 1}, q.options().dtype(kFloat)); + } else { + softmaxstats = + at::empty({b, s_q, h, 1}, q.options().dtype(kFloat)).transpose(1, 2); + } } - // do nothing if we got 0-element tensors - if (!q.numel() || !k.numel() || !v.numel()) { - return; + if (use_ragged) { + seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt)); + seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt)); + auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, 
q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_q); + auto cum_seqlen_kv = + at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_kv); + rag_off_q = cum_seqlen_q.mul(q.stride(-2)); + rag_off_k = cum_seqlen_kv.mul(k.stride(-2)); + rag_off_v = cum_seqlen_kv.mul(v.stride(-2)); + rag_off_o = cum_seqlen_q.mul(o.stride(-2)); + if (return_softmaxstats) { + rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2)); + } } + const auto dprops = at::cuda::getCurrentDeviceProperties(); + auto _dropoutseed = dropoutseed; + auto _dropoutoffset = dropoutoffset; + // cuDNN dropout bug requires these to be in int64 + if (dprops->major == 10 && dprops->minor == 0) { + _dropoutseed = dropoutseed.to(kLong); + _dropoutoffset = dropoutoffset.to(kLong); + } + + cudnnHandle_t handle = getCudnnHandle(); + + // NB: The key initialization will round up sequence length, stride data etc. + // if use_ragged_in_dense is enabled (to allow multiple sequence lenghths to + // reuse the same cached value/graph) auto key = MHACacheKeyWrapper( b, h, @@ -816,13 +1390,14 @@ void run_cudnn_SDP_fprop( attn_bias, dropout_probability, is_causal, - return_softmaxstats); - auto graph_and_tensors_ptr = mhagraphcache.find(key); - graph_and_tensors graph_and_tensors_values; - if (graph_and_tensors_ptr) { - graph_and_tensors_values = *graph_and_tensors_ptr; + return_softmaxstats, + false); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + if (graph_ptr) { + mha_graph = *graph_ptr; } else { - graph_and_tensors_values = build_graph_and_tensors( + mha_graph = build_graph( b, h, s_q, @@ -843,29 +1418,39 @@ void run_cudnn_SDP_fprop( _dropoutoffset, handle); } - auto [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats] = - graph_and_tensors_values; - std::unordered_map, void*> - variant_pack = { - {Q, q.data_ptr()}, - {K, k.data_ptr()}, - {V, v.data_ptr()}, - {attn_scale, &scaling_factor}, - {seed, _dropoutseed.data_ptr()}, - {offset, _dropoutoffset.data_ptr()}, - {O, o.data_ptr()}}; + std::unordered_map variant_pack = { + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {SCALE, &scaling_factor}, + {O, o.data_ptr()}}; if (return_softmaxstats) { - variant_pack[Stats] = softmaxstats.data_ptr(); + variant_pack[LSE] = softmaxstats.data_ptr(); } if (attn_bias.has_value()) { - variant_pack[bias.value()] = attn_bias.value().data_ptr(); + variant_pack[BIAS] = attn_bias.value().data_ptr(); + } + if (dropout_probability != 0.0f) { + variant_pack[SEED] = _dropoutseed.data_ptr(); + variant_pack[OFFSET] = _dropoutoffset.data_ptr(); + } + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr(); + variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr(); + variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr(); + variant_pack[RAG_K_OFF] = rag_off_k.data_ptr(); + variant_pack[RAG_V_OFF] = rag_off_v.data_ptr(); + variant_pack[RAG_O_OFF] = rag_off_o.data_ptr(); + if (return_softmaxstats) { + variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr(); + } } auto workspace_size = mha_graph->get_workspace_size(); auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); TORCH_CHECK( mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); - mhagraphcache.update(key, graph_and_tensors_values); + getMHAGraphCache_().update(key, mha_graph); } void run_cudnn_SDP_fprop_nestedtensor( @@ -904,72 +1489,78 @@ void run_cudnn_SDP_fprop_nestedtensor( if (return_softmaxstats && !softmaxstats.defined()) { 
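// --- Editor's sketch (not part of the patch). The avoid-recompile path above keys
// --- the graph cache on power-of-two-rounded sequence lengths and feeds the dense
// --- case through the ragged-offset machinery using synthetic cumulative offsets
// --- (cum_seqlen * stride). This standalone sketch shows both ideas with plain
// --- std::vector stand-ins; the patch itself uses at::full/cumsum/mul on tensors,
// --- and roundup_power2 here mirrors the helper added earlier in this file.
#include <cstdint>
#include <vector>

namespace sketch {

// Round a non-negative dimension up to the next power of two (0 -> 1),
// the same bit-twiddling idea used for the cache key above.
int roundup_power2(int dim) {
  if (!dim) return 1;
  dim--;
  dim |= dim >> 1;
  dim |= dim >> 2;
  dim |= dim >> 4;
  dim |= dim >> 8;
  dim |= dim >> 16;
  return dim + 1;
}

// For a dense batch where every sequence has the same length `s`, the ragged
// offsets degenerate to 0, s*stride, 2*s*stride, ..., b*s*stride (in elements).
std::vector<int64_t> dense_ragged_offsets(int64_t b, int64_t s, int64_t row_stride) {
  std::vector<int64_t> offsets(b + 1);
  for (int64_t i = 0; i <= b; ++i) {
    offsets[i] = i * s * row_stride;  // element offset of batch i's first row
  }
  return offsets;
}

} // namespace sketch

// Example: roundup_power2(384) == 512, so runs whose max sequence length falls in
// (256, 512] can hash to the same cached graph when TORCH_CUDNN_SDPA_AVOID_RECOMPILE
// is enabled, assuming the other cache-key fields also match.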
softmaxstats = at::empty({q.size(0), h_q, 1}, q.options().dtype(kFloat)); } - auto - [mha_graph, - Q, - K, - V, - bias, - attn_scale, - seed, - offset, - O, - Stats, - RAG_Q_OFF, - RAG_K_OFF, - RAG_V_OFF, - RAG_O_OFF, - RAG_STATS_OFF, - SEQ_LEN_Q, - SEQ_LEN_KV] = - build_graph_and_tensors_nestedtensor( - b, - h_q, - h_k, - h_v, - s_q, - s_kv, - d_qk, - d_v, - scaling_factor, - return_softmaxstats, - is_causal, - dropout_probability, - cum_seqlen_q, - cum_seqlen_kv, - q, - k, - v, - attn_bias, - softmaxstats, - o, - dropoutseed, - dropoutoffset, - handle); + + auto key = MHACacheKeyWrapper( + b, + h_q, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv + d_qk, + d_v, + q, + k, + v, + attn_bias, + dropout_probability, + is_causal, + return_softmaxstats, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + return_softmaxstats, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + softmaxstats, + o, + dropoutseed, + dropoutoffset, + handle); + } auto seqlen_q = at::diff(cum_seqlen_q, 1, 0); auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0); auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk); - auto rag_k_off = cum_seqlen_kv.mul(h_k * d_qk); + auto rag_k_off = cum_seqlen_kv.mul(h_k * d_v); auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v); auto rag_stats_off = cum_seqlen_q.mul(h_q); - std::unordered_map, void*> - variant_pack = { - {Q, q.data_ptr()}, - {K, k.data_ptr()}, - {V, v.data_ptr()}, - {attn_scale, &scaling_factor}, - {seed, dropoutseed.data_ptr()}, - {offset, dropoutoffset.data_ptr()}, - {O, o.data_ptr()}, - {RAG_Q_OFF, rag_q_off.data_ptr()}, - {RAG_O_OFF, rag_q_off.data_ptr()}, - {RAG_K_OFF, rag_k_off.data_ptr()}, - {RAG_V_OFF, rag_v_off.data_ptr()}, - {SEQ_LEN_Q, seqlen_q.data_ptr()}, - {SEQ_LEN_KV, seqlen_kv.data_ptr()}}; + std::unordered_map variant_pack = { + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {SCALE, &scaling_factor}, + {O, o.data_ptr()}, + {RAG_Q_OFF, rag_q_off.data_ptr()}, + {RAG_O_OFF, rag_q_off.data_ptr()}, + {RAG_K_OFF, rag_k_off.data_ptr()}, + {RAG_V_OFF, rag_v_off.data_ptr()}, + {SEQ_LEN_Q, seqlen_q.data_ptr()}, + {SEQ_LEN_KV, seqlen_kv.data_ptr()}}; if (return_softmaxstats) { - variant_pack[Stats] = softmaxstats.data_ptr(); - variant_pack[RAG_STATS_OFF] = cum_seqlen_q.data_ptr(); + variant_pack[LSE] = softmaxstats.data_ptr(); + variant_pack[RAG_LSE_OFF] = rag_stats_off.data_ptr(); + } + if (dropout_probability != 0.0f) { + variant_pack[SEED] = dropoutseed.data_ptr(); + variant_pack[OFFSET] = dropoutoffset.data_ptr(); } if (attn_bias.has_value()) { TORCH_CHECK("bias not supported with nestedtensor"); @@ -1008,6 +1599,9 @@ void run_cudnn_SDP_bprop( !softmaxstats.numel()) { return; } + Tensor seqlen_q, seqlen_kv; + Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse; + auto dprops = at::cuda::getCurrentDeviceProperties(); auto _dropoutseed = dropoutseed; auto _dropoutoffset = dropoutoffset; @@ -1034,10 +1628,28 @@ void run_cudnn_SDP_bprop( "with matching strides..."); #else const auto innermost_dO_stride = dO.strides()[dO.strides().size() - 1]; - if (innermost_dO_stride != 1) { + if (innermost_dO_stride != 1 || + use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { permute_to_matching_layout(o, dO_); } #endif + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + seqlen_q = at::full({b, 
1, 1, 1}, s_q, q.options().dtype(kInt)); + seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt)); + auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_q); + auto cum_seqlen_kv = + at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt)) + .cumsum(0, kInt) + .add_(-s_kv); + rag_off_q = cum_seqlen_q.mul(q.stride(-2)); + rag_off_k = cum_seqlen_kv.mul(k.stride(-2)); + rag_off_v = cum_seqlen_kv.mul(v.stride(-2)); + rag_off_o = cum_seqlen_q.mul(o.stride(-2)); + rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2)); + } + cudnnHandle_t handle = getCudnnHandle(); auto key = MHACacheKeyWrapper( b, @@ -1052,13 +1664,14 @@ void run_cudnn_SDP_bprop( attn_bias, dropout_probability, is_causal, - true); - auto graph_and_tensors_backward_ptr = mhagraphbackwardcache.find(key); - graph_and_tensors_backward graph_and_tensors_backward_values; - if (graph_and_tensors_backward_ptr) { - graph_and_tensors_backward_values = *graph_and_tensors_backward_ptr; + true, + false); + auto graph_backward_ptr = getMHAGraphBackwardCache_().find(key); + std::shared_ptr mha_graph; + if (graph_backward_ptr) { + mha_graph = *graph_backward_ptr; } else { - graph_and_tensors_backward_values = build_graph_and_tensors_backward( + mha_graph = build_graph_backward( b, h, s_q, @@ -1082,49 +1695,185 @@ void run_cudnn_SDP_bprop( _dropoutoffset, handle); } - auto - [mha_graph, - Q, - K, - V, - bias, - attn_scale, - Seed, - Offset, - O, - Do, - Stats, - Dq, - Dk, - Dv] = graph_and_tensors_backward_values; - std::unordered_map, void*> - variant_pack = {// inputs - {Q, q.data_ptr()}, - {K, k.data_ptr()}, - {V, v.data_ptr()}, - {O, o.data_ptr()}, - {Do, dO_.data_ptr()}, - {Stats, softmaxstats.data_ptr()}, - // outputs - {Dq, dQ.data_ptr()}, - {Dk, dK.data_ptr()}, - {Dv, dV.data_ptr()}, - // pass by value - {attn_scale, &scaling_factor}}; + std::unordered_map variant_pack = { + // inputs + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {O, o.data_ptr()}, + {DO, dO_.data_ptr()}, + {LSE, softmaxstats.data_ptr()}, + // outputs + {DQ, dQ.data_ptr()}, + {DK, dK.data_ptr()}, + {DV, dV.data_ptr()}, + {SCALE, &scaling_factor}}; if (dropout_probability != 0.0f) { - variant_pack[Seed] = _dropoutseed.data_ptr(); - variant_pack[Offset] = _dropoutoffset.data_ptr(); + variant_pack[SEED] = _dropoutseed.data_ptr(); + variant_pack[OFFSET] = _dropoutoffset.data_ptr(); } if (attn_bias.has_value()) { - variant_pack[bias.value()] = attn_bias.value().data_ptr(); + variant_pack[BIAS] = attn_bias.value().data_ptr(); + } + if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { + variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr(); + variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr(); + variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr(); + variant_pack[RAG_K_OFF] = rag_off_k.data_ptr(); + variant_pack[RAG_V_OFF] = rag_off_v.data_ptr(); + variant_pack[RAG_O_OFF] = rag_off_o.data_ptr(); + variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr(); + } + + auto workspace_size = mha_graph->get_workspace_size(); + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + TORCH_CHECK(!workspace_size || workspace_ptr.get()); + TORCH_CHECK( + mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); + getMHAGraphBackwardCache_().update(key, mha_graph); +} + +void run_cudnn_SDP_bprop_nestedtensor( + int64_t b, + int64_t h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + float scaling_factor, + bool is_causal, + 
float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + // do nothing if we got 0-element tensors + if (!q.numel() || !k.numel() || !v.numel() || !o.numel() || !dO.numel() || + !softmaxstats.numel()) { + return; } + + Tensor dO_ = dO; + const auto innermost_dO_stride = dO.strides()[dO.strides().size() - 1]; + if (innermost_dO_stride != 1) { + permute_to_matching_layout(o, dO_); + } + + auto seqlen_q = at::diff(cum_seqlen_q, 1, 0); + auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0); + auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk); + auto rag_k_off = cum_seqlen_kv.mul(h_k * d_v); + auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v); + auto rag_stats_off = cum_seqlen_q.mul(h_q); + + auto dprops = at::cuda::getCurrentDeviceProperties(); + auto _dropoutseed = dropoutseed; + auto _dropoutoffset = dropoutoffset; + // cuDNN dropout bug requires these to be in int64 + if (dprops->major == 10 && dprops->minor == 0) { + _dropoutseed = dropoutseed.to(kLong); + _dropoutoffset = dropoutoffset.to(kLong); + } + + cudnnHandle_t handle = getCudnnHandle(); + + auto key = MHACacheKeyWrapper( + b, + h_q, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv + d_qk, + d_v, + q, + k, + v, + attn_bias, + dropout_probability, + is_causal, + true, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_backward_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + o, + dO_, + softmaxstats, + dQ, + dK, + dV, + dropoutseed, + dropoutoffset, + handle); + } + + std::unordered_map variant_pack = { + // inputs + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {O, o.data_ptr()}, + {DO, dO_.data_ptr()}, + {LSE, softmaxstats.data_ptr()}, + // outputs + {DQ, dQ.data_ptr()}, + {DK, dK.data_ptr()}, + {DV, dV.data_ptr()}, + {SCALE, &scaling_factor}, + {RAG_Q_OFF, rag_q_off.data_ptr()}, + {RAG_O_OFF, rag_q_off.data_ptr()}, + {RAG_K_OFF, rag_k_off.data_ptr()}, + {RAG_V_OFF, rag_v_off.data_ptr()}, + {RAG_LSE_OFF, rag_stats_off.data_ptr()}, + {SEQ_LEN_Q, seqlen_q.data_ptr()}, + {SEQ_LEN_KV, seqlen_kv.data_ptr()}}; + if (dropout_probability != 0.0f) { + variant_pack[SEED] = _dropoutseed.data_ptr(); + variant_pack[OFFSET] = _dropoutoffset.data_ptr(); + } + TORCH_CHECK( + !attn_bias.has_value(), + "attn_bias not yet supportd with cuDNN Attention and NestedTensor"); + auto workspace_size = mha_graph->get_workspace_size(); auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); TORCH_CHECK(!workspace_size || workspace_ptr.get()); TORCH_CHECK( mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); - mhagraphbackwardcache.update(key, graph_and_tensors_backward_values); } } // namespace native diff --git a/aten/src/ATen/native/cudnn/MHA.h b/aten/src/ATen/native/cudnn/MHA.h index 045e8cf6dee9d..620abc1aa0a8e 100644 --- a/aten/src/ATen/native/cudnn/MHA.h +++ b/aten/src/ATen/native/cudnn/MHA.h @@ -70,4 +70,31 @@ void run_cudnn_SDP_bprop( const Tensor& dropoutseed, const Tensor& dropoutoffset); +void run_cudnn_SDP_bprop_nestedtensor( + int64_t b, + int64_t 
h_q, + int64_t h_k, + int64_t h_v, + int64_t s_q, + int64_t s_kv, + int64_t d_qk, + int64_t d_v, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& cum_seqlen_q, + const Tensor& cum_seqlen_kv, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const std::optional& attn_bias, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset); + } // namespace at::native diff --git a/aten/src/ATen/native/hip/ck_gemm.h b/aten/src/ATen/native/hip/ck_gemm.h index 176cbabd5e01c..0d42cad56fcda 100644 --- a/aten/src/ATen/native/hip/ck_gemm.h +++ b/aten/src/ATen/native/hip/ck_gemm.h @@ -10,6 +10,7 @@ inline void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas_gemm_internal_ck: not implemented"); } +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) template <> void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(double)); template <> @@ -18,7 +19,7 @@ template <> void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); - +#endif } // namespace at::native diff --git a/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip b/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip index 79cb14be41031..7561cede386fb 100644 --- a/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip +++ b/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip @@ -1,6 +1,7 @@ #undef __HIP_NO_HALF_CONVERSIONS__ - #include + +#if defined(USE_ROCM_CK_GEMM) #include #include @@ -781,3 +782,4 @@ void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } } // namespace at::native +#endif // USE_ROCM_CK_GEMM diff --git a/aten/src/ATen/native/hip/ck_gemm_float.hip b/aten/src/ATen/native/hip/ck_gemm_float.hip index b8301a47981c6..c4fea6088d3f0 100644 --- a/aten/src/ATen/native/hip/ck_gemm_float.hip +++ b/aten/src/ATen/native/hip/ck_gemm_float.hip @@ -1,6 +1,7 @@ #undef __HIP_NO_HALF_CONVERSIONS__ #include +#if defined(USE_ROCM_CK_GEMM) #include #include @@ -484,3 +485,4 @@ void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(double)) { } } // namespace at::native +#endif // USE_ROCM_CK_GEMM diff --git a/aten/src/ATen/native/hip/ck_gemm_half.hip b/aten/src/ATen/native/hip/ck_gemm_half.hip index 552f0de845418..ebe044c389721 100644 --- a/aten/src/ATen/native/hip/ck_gemm_half.hip +++ b/aten/src/ATen/native/hip/ck_gemm_half.hip @@ -1,6 +1,7 @@ #undef __HIP_NO_HALF_CONVERSIONS__ #include +#if defined(USE_ROCM_CK_GEMM) #include #include @@ -606,3 +607,4 @@ void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::Half)) { } } // namespace at::native +#endif // USE_ROCM_CK_GEMM diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 154118d9f2728..41226680c4b58 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #endif // TODO: Remove the condition on AT_ROCM_ENABLED entirely, @@ -145,13 +146,13 @@ at::Tensor miopen_convolution_relu( #include #include +#include #include #include #include #include -#include #include #include #include @@ -162,10 +163,13 @@ at::Tensor miopen_convolution_relu( namespace at { namespace native { -Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { - auto group_size = t.size(dim) / groups; - return t.narrow(dim, group_idx * group_size, group_size); -} +// See NOTE [ Convolution design ] in 
aten/src/ATen/native/cudnn/ConvShared.cpp + +// --------------------------------------------------------------------- +// +// Helper classes +// +// --------------------------------------------------------------------- // This POD struct is used to let us easily compute hashes of the // parameters @@ -174,6 +178,8 @@ struct ConvolutionParams miopenHandle_t handle; miopenDataType_t dataType; int input_size[2 + max_dim]; + uint8_t input_dim; + at::MemoryFormat memory_format; int input_stride[2 + max_dim]; int weight_size[2 + max_dim]; int padding[max_dim]; @@ -181,25 +187,29 @@ struct ConvolutionParams int dilation[max_dim]; int64_t groups; bool deterministic; - int device_id; //This is needed to distinguish between miopen handles of multiple gpus. + c10::DeviceIndex device_id; //This is needed to distinguish between miopen handles of multiple gpus. // NB: transposed purposely omitted: transposed just swaps // forward and backward, so you can reuse the benchmark entry, }; -// ConvolutionParams must be a POD because we read out its memory -// contenst as char* when hashing -static_assert(std::is_standard_layout_v, "ConvolutionParams not POD"); void setConvolutionParams( - ConvolutionParams* params, miopenHandle_t handle, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic) { - + ConvolutionParams* params, + miopenHandle_t handle, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + at::MemoryFormat memory_format) { miopenDataType_t dataType = getMiopenDataType(input); memset(params, 0, sizeof(ConvolutionParams)); params->dataType = dataType; params->handle = handle; // ASSERT(weight.dim() == input.dim()) + params->input_dim = input.dim(); + params->memory_format = memory_format; for (int i = 0; i != input.dim(); ++i) { params->input_size[i] = (int) input.size(i); params->input_stride[i] = (int) input.stride(i); @@ -214,9 +224,7 @@ void setConvolutionParams( } params->groups = groups; params->deterministic = deterministic; - int device_id; - HIP_CHECK(hipGetDevice(&device_id)); - params->device_id = device_id; + params->device_id = at::cuda::current_device(); } // Convenience struct for passing around descriptors and data @@ -239,31 +247,10 @@ struct ConvolutionArgs { // // --------------------------------------------------------------------- -// Hashing machinery for ConvolutionParams -struct ParamsHash { - std::size_t operator()(const ConvolutionParams& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (const auto i : c10::irange((int)sizeof(ConvolutionParams))) { - value ^= ptr[i]; - value *= 0x01000193; - } - return (size_t)value; - } -}; - -struct ParamsEqual { - bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(ConvolutionParams)) == 0; - } -}; - template struct BenchmarkCache { std::mutex mutex; - std::unordered_map map; + std::unordered_map, ParamsEqual> map; bool find(const ConvolutionParams& params, T* results) { std::lock_guard guard(mutex); @@ -314,39 +301,39 @@ size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvFwdAlgorithm_t) { size_t sz = 0; - miopenConvolutionForwardGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), 
args.idesc.desc(), args.cdesc.desc(), args.odesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdDataAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardDataGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.idesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdWeightsAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardWeightsGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.wdesc.desc(), - &sz); + &sz)); return sz; } @@ -649,6 +636,94 @@ Workspace chooseSolution(const ConvolutionArgs& args, uint64_t* solution_id) } } +// See NOTE [ raw_cudnn_convolution_forward_out ] in aten/src/ATen/native/cudnn/Conv_v7.cpp + +// --------------------------------------------------------------------- +// +// Splitting to 32bit +// +// --------------------------------------------------------------------- + +template +static inline void split_batch_dim_to_32bit_out( + const at::Tensor& output, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise, + int64_t max_worksize, + func_t func_32bit) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = output.numel(); + // Assume the shape of the tensor is (N, C, D1, D2, ...) + // if N * C * D1 * D2 * ... <= int_max, then no need to split at all + if (ni <= int_max && no <= int_max) { + func_32bit( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + return; + } + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension + // + // Here we use a simple heuristics to determine the size of each split + // We don't max out the 2^31 address space because this number is super + // large and very likely to get an OOM. + int64_t n = output.size(0); + int64_t max_inner_size = std::max(ni, no) / n; + int64_t split_size = std::max(max_worksize / max_inner_size, 1L); + int64_t num_splits = (n + split_size - 1) / split_size; + if (split_size * max_inner_size < int_max) { + for (const auto i : c10::irange(num_splits)) { + int64_t start = split_size * i; + int64_t split_size_ = std::min(split_size, n - start); + Tensor input_ = input.narrow(0, start, split_size_); + Tensor output_ = output.narrow(0, start, split_size_); + func_32bit( + output_, + input_, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + } + return; + } + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. + // - Is the memory layout NCHW or NHWC ? + // - If the conv is NCHW -> NC'H'W', then should we + // - split only NC? + // - split only N'C'? + // - split both? + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition + // to make sure that the boundary is handled correctly. + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case + TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); +} + // --------------------------------------------------------------------- // // Bias addition @@ -690,8 +765,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const */ } -// see NOTE [ Convolution design ] in src/Aten/native/cudnn/Conv.cpp +Tensor miopen_convolution_backward_bias(const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + + // TODO: Workaround since MIOpen does not support NHWC bias + // See #64426 + std::vector discard_dims; + for( int i = 0; i < grad_output_t.dim(); i++ ) { + if(i != output_channels_dim ) { + discard_dims.push_back(i); + } + } + + Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); + if( outputBias.dim() == 0 ) { + // always return a tensor of shape [_] + return outputBias.unsqueeze(0); + } + else { + return outputBias; + } + +/* MIOpen does not support NHWC bias. Activate once support is added. + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +*/ +} // --------------------------------------------------------------------- // @@ -699,30 +813,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const // // --------------------------------------------------------------------- -// The raw API directly invokes MIOpen. -// -// There are a few reasons this should never be directly exposed -// via ATen: -// -// - It takes output as a parameter (this should be computed!) -// - It doesn't do input checking -// - It doesn't resize output (it is assumed to be correctly sized) -// -void raw_miopen_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - +void raw_miopen_convolution_forward_out_32bit( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? 
miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ input, output, weight }; + ConvolutionArgs args{input, output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -730,10 +861,16 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.idesc.desc(), + input.const_data_ptr(), args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvFwdAlgorithm_t fwdAlg; @@ -744,472 +881,216 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.cdesc.desc(), + fwdAlg, + &zero, + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size)); } } -Tensor miopen_convolution_forward( +void raw_miopen_convolution_forward_out( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 256, + raw_miopen_convolution_forward_out_32bit); +} + +void miopen_convolution_forward_out( + TensorArg& output, CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + const TensorArg& input, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - if (output_t.numel() == 0) { - return output_t; - } - - // Avoid ambiguity of "output" when this is being used as backwards - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + auto memory_format = output->suggest_memory_format(); + convolution_shape_check( + c, input, weight, output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; + *output, + input_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } Tensor miopen_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; + TensorArg input{input_t, "input", 1 }, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; CheckedFrom c = "miopen_convolution"; - auto output_t = miopen_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -//Depthwise Convolutions -void raw_miopen_depthwise_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { +Tensor miopen_convolution_transpose_backward_input( + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + TensorArg grad_output{ grad_output_t, 
"grad_output", 1 }, weight{weight_t, "weight", 2}; + auto memory_format = + miopen_conv_suggest_memory_format(grad_output_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + grad_output_t.sizes(), weight_t.sizes(), padding, stride, dilation), + grad_output_t.options().memory_format(memory_format)); - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; + if (output_t.numel() == 0) { + return output_t; + } + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + "miopen_convolution_transpose_backward_input", + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + return *output; +} - ConvolutionArgs args{ input, output, weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); +// file organization would put miopen_convolution_transpose_backward_weight here, +// but it depends on miopen_convolution_backward_weight which is defined later +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic); - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); +std::tuple miopen_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - MIOPEN_CHECK(miopenConvolutionForwardImmediate( - args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_transpose_backward_input( + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); } - else { - miopenConvFwdAlgorithm_t fwdAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + if (output_mask[1]) { + grad_weight = miopen_convolution_transpose_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); } -} - -Tensor miopen_depthwise_convolution_forward( - CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef 
stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - checkAllSameType(c, {input, weight}); - checkAllSameGPU(c, {input, weight}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); - - // See #4500 - Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_depthwise_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; -} - -Tensor miopen_depthwise_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); - const Tensor& bias_t = *bias_t_maybe_owned; - - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_depthwise_convolution"; - auto output_t = miopen_depthwise_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); - if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); - } - return output_t; -} - -// --------------------------------------------------------------------- -// -// Convolution backward (bias) -// -// --------------------------------------------------------------------- - -Tensor miopen_convolution_backward_bias( - const Tensor& grad_output_t) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }; - - // TODO: Workaround since MIOpen does not support NHWC bias - // See #64426 - std::vector discard_dims; - for( int i = 0; i < grad_output_t.dim(); i++ ) { - if(i != output_channels_dim ) { - discard_dims.push_back(i); - } - } - - Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); - if( outputBias.dim() == 0 ) { - // always return a tensor of shape [_] - return outputBias.unsqueeze(0); - } - else { - return outputBias; - } - -/* MIOpen does not support NHWC bias. Activate once support is added. 
- auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); - - TensorArg grad_bias{ grad_bias_t, "result", 0 }; - - TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), - static_cast(grad_output->dim())}; - TensorDescriptor odesc{*grad_output}; - - auto handle = getMiopenHandle(); - auto dataType = getMiopenDataType(*grad_bias); - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), - &zero, bdesc.desc(), grad_bias->data_ptr())); - return *grad_bias; -*/ -} - -// --------------------------------------------------------------------- -// -// Convolution backward (weight) -// -// --------------------------------------------------------------------- - -void raw_miopen_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -//Depthwise backward weights. 
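For reference, the bias-gradient workaround kept in miopen_convolution_backward_bias (and deleted from its old location above) sums grad_output over every dimension except the channel dimension and always returns a 1-D tensor, because MIOpen has no NHWC bias kernel. A minimal standalone sketch of that reduction, assuming NCHW layout (output_channels_dim == 1); the helper name below is illustrative only:

#include <ATen/ATen.h>
#include <vector>

// Standalone sketch (not part of the patch): sum grad_output over every
// dimension except the channel dimension, then squeeze down to a 1-D [C]
// tensor, mirroring the NHWC bias workaround above.
at::Tensor bias_grad_sketch(const at::Tensor& grad_output) {
  std::vector<int64_t> discard_dims;
  for (int64_t i = 0; i < grad_output.dim(); ++i) {
    if (i != 1) {  // assume the channel dimension is dim 1 (NCHW)
      discard_dims.push_back(i);
    }
  }
  at::Tensor b = at::squeeze(at::sum(grad_output, discard_dims, /*keepdim=*/true));
  return b.dim() == 0 ? b.unsqueeze(0) : b;  // always return a tensor of shape [C]
}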
-void raw_miopen_depthwise_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -Tensor miopen_depthwise_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
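For context on split_batch_dim_to_32bit_out added earlier in this file: it splits only along the batch dimension and sizes each chunk so the chunk's element count stays below 2^31, using max_worksize / max_inner_size as the per-chunk batch count. A standalone sketch of that arithmetic (names below are illustrative; only the formulas mirror the helper, and n > 0 is assumed):

#include <algorithm>
#include <cstdint>
#include <limits>

struct SplitPlan {
  int64_t split_size;   // samples per chunk
  int64_t num_splits;   // number of chunks
  bool fits_in_int32;   // whether each chunk stays below 2^31 elements
};

// ni / no: numel of the input / output tensors; n: batch size; max_worksize:
// target element budget per chunk (e.g. 1024 * 1024 * 256 for the forward path above).
SplitPlan plan_batch_splits(int64_t n, int64_t ni, int64_t no, int64_t max_worksize) {
  constexpr int64_t int_max = std::numeric_limits<int32_t>::max();
  const int64_t max_inner_size = std::max(ni, no) / n;  // elements per sample
  const int64_t split_size = std::max(max_worksize / max_inner_size, int64_t{1});
  const int64_t num_splits = (n + split_size - 1) / split_size;
  return {split_size, num_splits, split_size * max_inner_size < int_max};
}

For example, with a per-chunk budget of 256 * 1024 * 1024 elements and 1024 * 1024 elements per sample, each chunk covers at most 256 samples; if even a single sample exceeds 2^31 elements, no batch split helps, which is the case the helper above rejects with TORCH_INTERNAL_ASSERT.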
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_depthwise_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_depthwise_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_depthwise_convolution_backward_weight( - "miopen_depthwise_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
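A note on memory formats: the patch replaces the per-call-site ternaries visible in the removed code here (miopen_conv_use_channels_last plus an ndimension() == 5 check) with a single miopen_conv_suggest_memory_format call. The sketch below is a hypothetical restatement of what those ternaries selected, not the actual helper:

#include <ATen/ATen.h>

// Hypothetical restatement of the removed selection logic: contiguous unless
// channels-last is preferred, in which case the 4-D or 5-D channels-last
// variant is chosen from the tensor's dimensionality.
at::MemoryFormat suggest_format_sketch(const at::Tensor& weight,
                                       bool prefer_channels_last) {
  if (!prefer_channels_last) {
    return at::MemoryFormat::Contiguous;
  }
  return (weight.ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d
                                    : at::MemoryFormat::ChannelsLast;
}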
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_input( - const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - TensorArg grad_output { grad_output_t, "grad_output", 1 }, - weight { weight_t, "weight", 2 }; - return miopen_convolution_forward( - "miopen_convolution_transpose_backward_input", - grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, input, grad_output, - padding, stride, dilation, groups, benchmark, deterministic); -} - -std::tuple miopen_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } - - return std::tuple{grad_input, grad_weight, grad_bias}; + + return std::tuple{grad_input, grad_weight, grad_bias}; } // --------------------------------------------------------------------- @@ -1218,23 +1099,50 @@ std::tuple miopen_convolution_transpose_backwa // // --------------------------------------------------------------------- -void raw_miopen_convolution_backward_input_out( +// See NOTE [ Backward vs transpose convolutions ] in aten/src/ATen/native/cudnn/ConvShared.cpp + +void raw_miopen_convolution_backward_input_out_32bit( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool 
deterministic) { - + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{grad_input, grad_output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(grad_input, weight); + setConvolutionParams( + &args.params, + args.handle, + grad_input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(grad_input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + grad_output.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -1245,7 +1153,10 @@ void raw_miopen_convolution_backward_input_out( args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvBwdDataAlgorithm_t bwdDataAlg; @@ -1256,216 +1167,521 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), + &one, + args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), bwdDataAlg, &zero, - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); + args.cdesc.desc(), + bwdDataAlg, + &zero, + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size)); } } -// see NOTE [ Backward vs transpose convolutions ] in src/Aten/native/cudnn/Conv.cpp +void raw_miopen_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 128, + raw_miopen_convolution_backward_input_out_32bit); +} Tensor miopen_convolution_backward_input( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + IntArrayRef input_size, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef 
stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - + auto memory_format = miopen_conv_suggest_memory_format(*grad_output, *weight); Tensor grad_input_t = at::detail::empty_cuda( input_size, grad_output->options().memory_format(memory_format)); // Avoid "grad_input" when this is being used as transposed convolution - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + TensorArg grad_input{grad_input_t, "result", 0}; + convolution_shape_check( + c, grad_input, weight, grad_output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor grad_output_contig = grad_output->contiguous(memory_format); - grad_output_contig.resize_(grad_output_contig.sizes(), memory_format); raw_miopen_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + *grad_input, + grad_output_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); return *grad_input; } -Tensor miopen_convolution_transpose_forward( - CheckedFrom c, - const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), - padding, output_padding, stride, dilation, groups); - return miopen_convolution_backward_input(c, input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); -} - +// overload Tensor miopen_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; + IntArrayRef input_size, + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + TensorArg grad_output{grad_output_t, "grad_output", 1}, + weight{weight_t, "weight", 2}; return miopen_convolution_backward_input( "miopen_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -//Depthwise convolutions backward data. 
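miopen_convolution_backward_input above is also what miopen_convolution_transpose_forward later in this file lowers onto, with conv_input_size supplying the grad_input shape. A standalone sketch of the per-dimension shape arithmetic, assuming the standard PyTorch convolution shape formulas (illustrative helpers, not the ConvUtils implementation):

#include <cstdint>

// Forward convolution output size for one spatial dimension.
int64_t conv_out_dim(int64_t in, int64_t kernel, int64_t pad,
                     int64_t stride, int64_t dilation) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

// Inverse used when a transposed convolution's forward pass is lowered onto
// the data-backward of a regular convolution.
int64_t conv_in_dim(int64_t out, int64_t kernel, int64_t pad, int64_t stride,
                    int64_t dilation, int64_t output_padding) {
  return (out - 1) * stride - 2 * pad + dilation * (kernel - 1) + output_padding + 1;
}

For example, in = 56, kernel = 3, pad = 1, stride = 2, dilation = 1 gives out = 28, and recovering 56 from 28 needs output_padding = 1 because the forward division discarded a remainder.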
-void raw_miopen_depthwise_convolution_backward_input_out( - const at::Tensor& grad_input, - const at::Tensor& grad_output, - const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenDepthwise; +void raw_miopen_convolution_backward_weight_out_32bit( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{input, grad_output, grad_weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(input, grad_weight); + setConvolutionParams( + &args.params, + args.handle, + input, + grad_weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(grad_weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); + Workspace workspace = chooseSolution(args, &solution_id); - MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, args.odesc.desc(), grad_output.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size, + solution_id)); + } + else { + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, + args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), + args.cdesc.desc(), + bwdFilterAlg, + &zero, + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size)); + } +} + +void raw_miopen_convolution_backward_weight_out( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = grad_output.numel(); + // Assume the shape of the tensor 
is (N, C, D1, D2, ...) + // if N * C * D1 * D2 * ... <= int_max, then no need to split at all + if (ni <= int_max && no <= int_max) { + raw_miopen_convolution_backward_weight_out_32bit( + grad_weight, + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + return; } - else { - miopenConvBwdDataAlgorithm_t bwdDataAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardData( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), bwdDataAlg, &zero, - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension + // + // Here we use a simple heuristics to determine the size of each split + // We don't max out the 2^31 address space because this number is super + // large and very likely to get an OOM. + int64_t n = grad_output.size(0); + int64_t max_inner_size = std::max(ni, no) / n; + int64_t split_size = + std::max(1024 * 1024 * 512 / max_inner_size, 1L); + int64_t num_splits = (n + split_size - 1) / split_size; + if (split_size * max_inner_size < int_max) { + const auto kAccType = (grad_weight.scalar_type() == kHalf || + grad_weight.scalar_type() == kBFloat16) + ? kFloat + : grad_weight.scalar_type(); + Tensor grad_weight_accumulator = + at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); + for (const auto i : c10::irange(num_splits)) { + int64_t start = split_size * i; + int64_t split_size_ = std::min(split_size, n - start); + Tensor input_ = input.narrow(0, start, split_size_); + Tensor grad_output_ = grad_output.narrow(0, start, split_size_); + Tensor grad_weight_ = at::empty_like(grad_weight); + raw_miopen_convolution_backward_weight_out_32bit( + grad_weight_, + grad_output_, + input_, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + grad_weight_accumulator.add_(grad_weight_); + } + grad_weight.copy_(grad_weight_accumulator); + return; } + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. + // - Is the memory layout NCHW or NHWC ? + // - If the conv is NCHW -> NC'H'W', then should we + // - split only NC? + // - split only N'C'? + // - split both? + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition + // to make sure that the boundary is handled correctly. + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case + TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } -Tensor miopen_depthwise_convolution_backward_input( +Tensor miopen_convolution_backward_weight( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - checkAllSameType(c, {grad_output, weight}); - checkAllSameGPU(c, {grad_output, weight}); + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + auto memory_format = miopen_conv_suggest_memory_format(input_t, grad_output_t); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } + Tensor grad_output_contig_t = grad_output_t.contiguous(memory_format); + TensorArg grad_output_contig{grad_output_contig_t, "grad_output", 1}; - Tensor grad_input_t = at::detail::empty_cuda( - input_size, grad_output->options().memory_format(memory_format)); + Tensor input_contig_t = input_t.contiguous(memory_format); + TensorArg input{input_contig_t, "input", 2}; - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + checkAllSameType(c, {grad_output_contig, input}); + checkAllSameGPU(c, {grad_output_contig, input}); - // See #4500 - Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); + auto grad_weight_t = + at::empty(weight_size, grad_output_contig->options(), memory_format); - Tensor grad_output_contig = grad_output->contiguous(memory_format); - grad_output_contig.resize_(grad_output_contig.sizes(), memory_format); + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. 
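Unlike the data path, the weight gradient from each batch chunk must be summed, so the splitting loop above accumulates per-chunk results in fp32 whenever grad_weight is half or bfloat16 and only casts back at the end. A standalone sketch of that accumulation pattern (helper name and signature are illustrative):

#include <ATen/ATen.h>
#include <vector>

// Standalone sketch of the split-and-accumulate pattern used above: one weight
// gradient per batch chunk, summed in fp32 when the weight dtype is
// half/bfloat16 to limit rounding error, then copied back into grad_weight.
void accumulate_chunk_grads(at::Tensor& grad_weight,
                            const std::vector<at::Tensor>& chunk_grads) {
  const auto acc_dtype =
      (grad_weight.scalar_type() == at::kHalf ||
       grad_weight.scalar_type() == at::kBFloat16)
          ? at::kFloat
          : grad_weight.scalar_type();
  at::Tensor acc =
      at::zeros(grad_weight.sizes(), grad_weight.options().dtype(acc_dtype));
  for (const auto& g : chunk_grads) {
    acc.add_(g);  // low-precision chunks are promoted into the fp32 accumulator
  }
  grad_weight.copy_(acc);  // copy_ casts back to the original dtype
}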
+ TensorArg grad_weight{grad_weight_t, "result", 0}; + convolution_shape_check( + c, + input, + grad_weight, + grad_output_contig, + padding, + stride, + dilation, + groups); - raw_miopen_depthwise_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + raw_miopen_convolution_backward_weight_out( + *grad_weight, + *grad_output_contig, + *input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); - return *grad_input; + return grad_weight_t; } -Tensor miopen_depthwise_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; - return miopen_depthwise_convolution_backward_input( - "miopen_depthwise_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); +// overload +Tensor miopen_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + grad_output_t, + input_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -std::tuple miopen_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +std::tuple miopen_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); + if (input.numel() == 0) { + if (output_mask[0]) { + grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[1]) { + grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[2]) { + grad_bias = at::zeros_like(grad_output_t, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + } else { + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + 
deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } } - return std::tuple{grad_input, grad_weight, grad_bias}; + return std::tuple{grad_input, grad_weight, grad_bias}; } -std::tuple miopen_depthwise_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { +Tensor miopen_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + auto input_size = conv_input_size( + grad_output->sizes(), + weight->sizes(), + padding, + output_padding, + stride, + dilation, + groups); + return miopen_convolution_backward_input( + c, + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + input_t, + grad_output_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } +Tensor miopen_convolution_transpose( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); + const Tensor& bias_t = *bias_t_maybe_owned; - return std::tuple{grad_input, grad_weight, grad_bias}; + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; + CheckedFrom c = "miopen_convolution_transpose"; + auto output_t = miopen_convolution_transpose_forward( + c, + input, + weight, + padding, + output_padding, + stride, + dilation, + groups, + benchmark, + deterministic); + if (bias->defined()) { + miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; } -Tensor miopen_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) +// --------------------------------------------------------------------- +// +// Convolution depthwise +// +// 
--------------------------------------------------------------------- + +Tensor miopen_depthwise_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); @@ -1474,16 +1690,86 @@ Tensor miopen_convolution_transpose( TensorArg input { input_t, "input", 1 }, weight { weight_t, "weight", 2 }, bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_convolution_transpose"; - auto output_t = miopen_convolution_transpose_forward( - c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + CheckedFrom c = "miopen_depthwise_convolution"; + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -// MIOpen fused convolution bias activation forward +std::tuple miopen_depthwise_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// fusions +// --------------------------------------------------------------------- + void raw_miopen_convolution_relu_out( const Tensor& output, const Tensor& input, @@ -1495,17 +1781,35 @@ void raw_miopen_convolution_relu_out( int64_t groups, bool benchmark, bool deterministic) { - auto dataType = getMiopenDataType(input); miopenConvolutionMode_t c_mode = miopenConvolution; - ConvolutionArgs args{ input, output, weight }; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, 
args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); TensorDescriptor bdesc; bdesc.set(bias.expand({1, bias.size(0)}), output.dim()); @@ -1549,8 +1853,8 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m } Tensor miopen_convolution_add_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const Tensor& z, const std::optional& alpha, const std::optional& bias, @@ -1562,17 +1866,28 @@ Tensor miopen_convolution_add_relu( // MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function: // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) - auto memory_format = input.suggest_memory_format(); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_add_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1581,53 +1896,51 @@ Tensor miopen_convolution_add_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input.dim(), _bias).add(z, _alpha); - contig_output.add_(alpha_mul_z_add_bias); - contig_output.relu_(); + at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha); + contig_output_t.add_(alpha_mul_z_add_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } Tensor miopen_convolution_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { - auto memory_format = input.suggest_memory_format(); - auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); // MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d - if (input.suggest_memory_format() == at::MemoryFormat::Contiguous - && input.scalar_type() == at::kFloat - && input.ndimension() == 4) { + if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous + && input_t.scalar_type() == at::kFloat + && input_t.ndimension() == 4) { // FuseFrozenConvAddRelu performs some tensor shape checking Tensor output_t = at::detail::empty_cuda( conv_output_size( - input.sizes(), weight.sizes(), padding, stride, dilation), - input.options().memory_format(input.suggest_memory_format())); + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(input_t.suggest_memory_format())); if (output_t.numel() == 0) { return output_t; } @@ -1643,8 +1956,8 @@ Tensor miopen_convolution_relu( raw_miopen_convolution_relu_out( output_t, - input, - weight, + input_t, + weight_t, _bias, stride, padding, @@ -1659,12 +1972,25 @@ Tensor miopen_convolution_relu( else { // fallback - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1673,26 +1999,26 @@ Tensor miopen_convolution_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor reshaped_bias = at::native::reshape_bias(input.dim(), _bias); - contig_output.add_(reshaped_bias); - contig_output.relu_(); + at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias); + contig_output_t.add_(reshaped_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } } diff --git a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp index 813db7a97ef9f..873005b3dd2bc 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -49,7 +50,7 @@ bool check_no_grad(sdp::sdp_params const& params, bool debug) { return !any_inputs_require_grad || !gradmode_enabled; } -bool use_overrideable_xpu(sdp::sdp_params const& params, bool debug) { +bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) { constexpr auto supported_dtypes = c10::array_of( at::kFloat, at::kBFloat16, at::kHalf); // double is not supported @@ -73,6 +74,42 @@ bool use_overrideable_xpu(sdp::sdp_params const& params, bool debug) { return sdp::check_tensor_dtype(params, supported_dtypes, debug); } +bool can_use_flash_attention(sdp::sdp_params const& params, bool debug) { + // Currently, XPU falls back flash attention to overrideable + return can_use_overrideable_attention(params, debug); +} + +bool can_use_cudnn_attention(sdp::sdp_params const& params, bool debug) { + if (debug) { + TORCH_WARN("XPU doesn't support the SDPA cuDNN attention backend."); + } + return false; +} + +bool can_use_mem_efficient_attention(sdp::sdp_params const& params, bool debug) { + if (debug) { + TORCH_WARN("XPU doesn't support the SDPA memory-efficient attention backend."); + } + return false; +} + +bool priority_order_init = false; + +std::array priority_order( + sdp::sdp_params const& params) { + if (!priority_order_init) { + priority_order_init = true; + const std::vector priority_order = { + static_cast(at::SDPBackend::overrideable), + static_cast(at::SDPBackend::math), + static_cast(at::SDPBackend::flash_attention), + static_cast(at::SDPBackend::efficient_attention), + static_cast(at::SDPBackend::cudnn_attention)}; + at::globalContext().setSDPPriorityOrder(priority_order); + } + return at::globalContext().sDPPriorityOrder(); +} + sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { // This function defines the priority order of the different sdp backends // 1.
Flash Attention @@ -85,20 +122,16 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { } // Get ideal kernel ordering - const std::array priority_order{ - sdp::SDPBackend::overrideable, - sdp::SDPBackend::math, - sdp::SDPBackend::flash_attention, - }; + const auto ordering = priority_order(kernel_params); // Because TORCHCHECK checks if condition is true we negate debug so that // The statements will be printed when debug is true bool print_debug = false; - for (auto& backend : priority_order) { + for (auto& backend : ordering) { switch (backend) { case sdp::SDPBackend::overrideable: if (ctx.userEnabledOverrideableSDP() && - use_overrideable_xpu(kernel_params, print_debug)) { + can_use_overrideable_attention(kernel_params, print_debug)) { return sdp::SDPBackend::overrideable; } break; @@ -109,25 +142,43 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) { break; case sdp::SDPBackend::flash_attention: if (ctx.userEnabledFlashSDP() && - use_overrideable_xpu(kernel_params, print_debug)) { - TORCH_WARN( - "Flash Attention is not supported on XPU, falling back to overrideable kernel."); + can_use_flash_attention(kernel_params, print_debug)) { + TORCH_WARN_ONCE( + "SDPA Flash Attention backend is not supported on XPU, falling back to OVERRIDEABLE backend."); return sdp::SDPBackend::overrideable; } break; + case sdp::SDPBackend::cudnn_attention: + if (ctx.userEnabledCuDNNSDP() && + can_use_cudnn_attention(kernel_params, print_debug)) { + TORCH_CHECK(false, "Invalid backend"); + } + break; + case sdp::SDPBackend::efficient_attention: + if (ctx.userEnabledMemEfficientSDP() && + can_use_mem_efficient_attention(kernel_params, print_debug)) { + TORCH_CHECK(false, "Invalid backend"); + } + break; default: TORCH_CHECK(false, "Invalid backend"); } } // If we have gotten to this point then two things have happened: - // 1. use_overrideable_xpu did not satisfy the constraints to be ran + // 1. can_use_overrideable_attention did not satisfy the constraints to be run // 2. The user has explicitly disabled the math kernel // We then re-run the kernel checks with debug enabled to print out the // reason why the kernel was not selected print_debug = true; - TORCH_WARN("OneDNN kernel not used because:"); - use_overrideable_xpu(kernel_params, print_debug); + TORCH_WARN("Flash attention kernel not used because:"); + can_use_flash_attention(kernel_params, print_debug); + TORCH_WARN("Overrideable attention kernel not used because:"); + can_use_overrideable_attention(kernel_params, print_debug); + TORCH_WARN("CuDNN attention kernel not used because:"); + can_use_cudnn_attention(kernel_params, print_debug); + TORCH_WARN("Memory Efficient attention kernel not used because:"); + can_use_mem_efficient_attention(kernel_params, print_debug); TORCH_CHECK(!print_debug, "No available kernel. Aborting execution.") return sdp::SDPBackend::error; } @@ -209,7 +260,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu( alloc_with_matching_layout(query, output, output_shape); at::Tensor logsumexp, debug_attn_mask; // not supported - at::native::onednn::gpu_float_sdpa( + at::native::onednn::sdpa( batch_size, seq_len_q, seq_len_kv, @@ -223,7 +274,9 @@ _scaled_dot_product_fused_attention_overrideable_xpu( attn_bias, is_causal, scale.has_value() ?
scale.value() : (1.0 / std::sqrt(head_dim_qk)), - output); + output, + false, + logsumexp); // rng not used auto philox_seed = at::empty({}, at::dtype(at::kLong)); diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp index 1d90711f6e382..e840e21f4f7a1 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp @@ -13,6 +13,9 @@ using dims = logical_tensor::dims; using op = dnnl::graph::op; using partition = dnnl::graph::partition; +constexpr logical_tensor::data_type sdpa_intermediate_dtype = + logical_tensor::data_type::f32; + inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { return scalar_type == c10::ScalarType::Float ? data_type::f32 : scalar_type == c10::ScalarType::Half ? data_type::f16 @@ -20,6 +23,8 @@ inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { : data_type::undef; } +namespace sdpa_forward { + struct SDPALogicalParams { enum class TensorID { query, @@ -28,7 +33,8 @@ struct SDPALogicalParams { neg_inf, attn_mask, value, - output, + attention, + logsumexp, end, }; @@ -38,14 +44,16 @@ struct SDPALogicalParams { std::optional neg_inf; std::optional attn_mask; logical_tensor value{}; - logical_tensor output{}; + logical_tensor attention{}; + std::optional logsumexp; SDPALogicalParams( const at::Tensor& query_, const at::Tensor& key_, const at::Tensor& value_, const std::optional& attn_mask_, - const at::Tensor& output_, + const at::Tensor& attention_, + const at::Tensor& logsumexp_, int batch_size, int seq_len_q, int seq_len_kv, @@ -53,19 +61,26 @@ struct SDPALogicalParams { int num_head_kv, int head_dim_qk, int head_dim_v, - bool is_causal) { + bool is_causal, + bool compute_logsumexp) { const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); TORCH_INTERNAL_ASSERT( (dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + query_.scalar_type() == attention_.scalar_type(), + "scaled_dot_product_attention_xpu: query and attention tensors should have the same data type."); const dims scalar_shape = {1}; - std::vector inputLogicalTensors; at::Tensor reshaped_query = query_; at::Tensor reshaped_key = key_; at::Tensor reshaped_value = value_; - at::Tensor reshaped_output = output_; + at::Tensor reshaped_attention = attention_; + at::Tensor reshaped_logsumexp = + compute_logsumexp ? 
logsumexp_.unsqueeze(-1) : logsumexp_; at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + + // handle broadcasted input tensors for OneDNN if (at::native::onednn::is_broadcast(reshaped_query)) { at::native::onednn::undo_broadcast(reshaped_query); } @@ -75,9 +90,6 @@ struct SDPALogicalParams { if (at::native::onednn::is_broadcast(reshaped_value)) { at::native::onednn::undo_broadcast(reshaped_value); } - if (at::native::onednn::is_broadcast(reshaped_output)) { - at::native::onednn::undo_broadcast(reshaped_output); - } if (attn_mask_.has_value() && at::native::onednn::is_broadcast(reshaped_attn_mask)) { at::native::onednn::undo_broadcast(reshaped_attn_mask); @@ -95,23 +107,22 @@ struct SDPALogicalParams { {batch_size, group_num, group_size, seq_len_q, head_dim_qk}); reshaped_key = key_.unsqueeze(2); reshaped_value = value_.unsqueeze(2); - reshaped_output = output_.view( + reshaped_attention = attention_.view( {batch_size, group_num, group_size, seq_len_q, head_dim_v}); if (attn_mask_.has_value() && attn_mask_.value().dim() == 4) { reshaped_attn_mask = attn_mask_.value().unsqueeze(2); } } - query = { - static_cast(TensorID::query), - dtype, - reshaped_query.sizes().vec(), - reshaped_query.strides().vec()}; - key = { - static_cast(TensorID::key), - dtype, - reshaped_key.sizes().vec(), - reshaped_key.strides().vec()}; +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); scale = { static_cast(TensorID::scale), to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), @@ -132,22 +143,19 @@ struct SDPALogicalParams { TORCH_INTERNAL_ASSERT( (mask_dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); - attn_mask = { - static_cast(TensorID::attn_mask), - mask_dtype, - reshaped_attn_mask.sizes().vec(), - reshaped_attn_mask.strides().vec()}; + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); } - value = { - static_cast(TensorID::value), - dtype, - reshaped_value.sizes().vec(), - reshaped_value.strides().vec()}; - output = { - static_cast(TensorID::output), - dtype, - reshaped_output.sizes().vec(), - reshaped_output.strides().vec()}; + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(attention, dtype); + if (compute_logsumexp) { + TORCH_INTERNAL_ASSERT( + logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention: Expected logsumexp data type in FP32, but got ", + logsumexp_.scalar_type(), + " instead."); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + } +#undef LOGIC_TENSOR_DESC } std::vector get_input() const { std::vector input = {query, key, scale}; @@ -161,16 +169,21 @@ struct SDPALogicalParams { return input; } std::vector get_output() const { - return {output}; + std::vector output; + output.push_back(attention); + if (logsumexp.has_value()) { + output.push_back(logsumexp.value()); + } + return output; } }; partition create_sdpa_graph_partition( bool is_causal, + bool compute_logsumexp, data_type dtype, const SDPALogicalParams& params) { // graph building and partitioning - // currently, we assume that Q and K have same sequence length size_t lt_id = static_cast(SDPALogicalParams::TensorID::end); size_t op_id = 0; @@ -180,7 +193,7 @@ partition create_sdpa_graph_partition( // Matrix Extensions (Intel(R) XMX) support, which means the // Q/K/V tensors have bf16 or f16 data type while the output of the first // MatMul, 
Scale, Mask, and the input of SoftMax are in f32 data type. - logical_tensor matmul_qk_out{lt_id++, data_type::f32}; + logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; op matmul_qk{ op_id++, op::kind::MatMul, @@ -189,7 +202,7 @@ partition create_sdpa_graph_partition( "matmul_qk"}; matmul_qk.set_attr(op::attr::transpose_b, true); - logical_tensor scaled_qk_out{lt_id++, data_type::f32}; + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; op scale_mul{ op_id++, op::kind::Multiply, @@ -214,7 +227,7 @@ partition create_sdpa_graph_partition( if (params.attn_mask.has_value()) { TORCH_INTERNAL_ASSERT( !is_causal, "Additive mask cannot use with is_causal."); - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_add = { op_id++, op::kind::Add, @@ -249,7 +262,7 @@ partition create_sdpa_graph_partition( {mask_gt_out.value()}, "mask_gt"}; - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_select = { op_id++, op::kind::Select, @@ -270,12 +283,15 @@ partition create_sdpa_graph_partition( logical_tensor softmax_out{lt_id++, dtype}; softmax.add_input(masked_qk_out.value_or(scaled_qk_out)); softmax.add_output(softmax_out); + if (compute_logsumexp) { + softmax.add_output(params.logsumexp.value()); + } op matmul_v{ op_id++, op::kind::MatMul, {softmax_out, params.value}, - {params.output}, + {params.attention}, "matmul_v"}; constexpr auto ekind = dnnl::engine::kind::gpu; @@ -304,44 +320,469 @@ partition create_sdpa_graph_partition( partition& find_or_create_graph_partition( bool is_causal, + bool compute_logsumexp, const SDPALogicalParams& params) { - thread_local static PartitionCache cache; + thread_local PartitionCache cache; const data_type dtype = params.query.get_data_type(); // cache key creation // patternID is determined on the basis of the arguments provided std::bitset<32> patternID; if (dtype == data_type::f32) { - // bit 3 corresponds to float32 dtype - patternID.set(3, 1); + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); } if (dtype == data_type::bf16) { - // bit 2 corresponds to fp16/bf16 dtype - patternID.set(2, 1); + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); } // sdp pattern - patternID.set(4, 1); + patternID.set(static_cast(PartitionCache::BitType::SdpaPattern), 1); // Refer to comments in Utils.h. 
The first 8 bits are reserved int pos = 8; // attn_mask patternID.set(pos++, params.attn_mask.has_value()); patternID.set(pos++, is_causal); + // compute_logsumexp + patternID.set(pos++, compute_logsumexp); auto partition_ = cache.find_partition(patternID); if (!partition_.has_value()) { // partition cache no hit // graph building and partitioning - partition sdp_partition = - create_sdpa_graph_partition(is_causal, dtype, params); + partition sdp_partition = create_sdpa_graph_partition( + is_causal, compute_logsumexp, dtype, params); partition_ = cache.insert_partition_cache(patternID, sdp_partition); } return *partition_; } +} // namespace sdpa_forward + +namespace sdpa_backward { + +struct SDPABackwardLogicalParams { + enum class TensorID { + grad_out, + query, + key, + value, + out, + logsumexp, + scale, + neg_inf, + attn_mask, + grad_query, + grad_key, + grad_value, + end, + }; + + logical_tensor grad_out{}; + logical_tensor query{}; + logical_tensor key{}; + logical_tensor value{}; + logical_tensor out{}; + logical_tensor logsumexp{}; + logical_tensor scale{}; + std::optional neg_inf; + std::optional attn_mask; + logical_tensor grad_query{}; + logical_tensor grad_key{}; + logical_tensor grad_value{}; + + SDPABackwardLogicalParams( + const at::Tensor& grad_out_, + const at::Tensor& query_, + const at::Tensor& key_, + const at::Tensor& value_, + const at::Tensor& out_, + const at::Tensor& logsumexp_, + const std::optional& attn_mask_, + const at::Tensor& grad_query_, + const at::Tensor& grad_key_, + const at::Tensor& grad_value_, + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + bool is_causal) { + const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); + TORCH_INTERNAL_ASSERT( + (dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + grad_out_.scalar_type() == query_.scalar_type() && + grad_out_.scalar_type() == key_.scalar_type() && + grad_out_.scalar_type() == value_.scalar_type() && + grad_out_.scalar_type() == out_.scalar_type(), + "scaled_dot_product_attention_backward_xpu: Expected grad_out, q, k, v and out to have the same data type, but got ", + " grad_out: ", + grad_out_.scalar_type(), + ", q: ", + query_.scalar_type(), + ", k: ", + key_.scalar_type(), + ", v: ", + value_.scalar_type(), + ", out: ", + out_.scalar_type()); + TORCH_INTERNAL_ASSERT( + logsumexp_.defined() && logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention_backward_xpu: Expected logsumexp to be defined and have FP32 data type"); + const dims scalar_shape = {1}; + + at::Tensor reshaped_grad_out = grad_out_; + at::Tensor reshaped_query = query_; + at::Tensor reshaped_key = key_; + at::Tensor reshaped_value = value_; + at::Tensor reshaped_out = out_; + at::Tensor reshaped_logsumexp = logsumexp_.unsqueeze(-1); + at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + at::Tensor reshaped_grad_query = grad_query_; + at::Tensor reshaped_grad_key = grad_key_; + at::Tensor reshaped_grad_value = grad_value_; + + // handle broadcasted input tensors for OneDNN + if (at::native::onednn::is_broadcast(reshaped_grad_out)) { + at::native::onednn::undo_broadcast(reshaped_grad_out); + } + if (at::native::onednn::is_broadcast(reshaped_query)) { + at::native::onednn::undo_broadcast(reshaped_query); + } + if (at::native::onednn::is_broadcast(reshaped_key)) { + at::native::onednn::undo_broadcast(reshaped_key); + } + if 
(at::native::onednn::is_broadcast(reshaped_value)) { + at::native::onednn::undo_broadcast(reshaped_value); + } + if (attn_mask_.has_value() && + at::native::onednn::is_broadcast(reshaped_attn_mask)) { + at::native::onednn::undo_broadcast(reshaped_attn_mask); + } + + // TODO: Support GQA in backward pass once OneDNN supports it. + +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(grad_out, dtype); + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(out, dtype); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + scale = { + static_cast(TensorID::scale), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + if (is_causal) { + neg_inf = { + static_cast(TensorID::neg_inf), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + } + if (attn_mask_.has_value()) { + const data_type mask_dtype = + to_logical_tensor_data_type(attn_mask_->scalar_type()); + TORCH_INTERNAL_ASSERT( + (mask_dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); + } + LOGIC_TENSOR_DESC(grad_query, dtype); + LOGIC_TENSOR_DESC(grad_key, dtype); + LOGIC_TENSOR_DESC(grad_value, dtype); +#undef LOGIC_TENSOR_DESC + } + std::vector get_input() const { + std::vector input = { + grad_out, query, key, value, out, logsumexp, scale}; + if (neg_inf.has_value()) { + input.push_back(neg_inf.value()); + } + if (attn_mask.has_value()) { + input.push_back(attn_mask.value()); + } + return input; + } + std::vector get_output() const { + std::vector output = {grad_query, grad_key, grad_value}; + return output; + } +}; + +partition create_sdpa_backward_graph_partition( + bool is_causal, + data_type dtype, + const SDPABackwardLogicalParams& params) { + // graph building and partitioning + size_t lt_id = static_cast(SDPABackwardLogicalParams::TensorID::end); + size_t op_id = 0; + + // OneDNN graph has optimized implementation for `f16` or `bf16` SDPA with + // `f32` intermediate data type on Intel Graphics Products with Intel(R) Xe + // Matrix Extensions (Intel(R) XMX) support, which means the + // Q/K/V tensors have bf16 or f16 data type while the output of the first + // MatMul, Scale, Mask, and the input of SoftMax are in f32 data type. 
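The ops assembled below recompute the attention probabilities from the saved logsumexp and then chain the standard scaled-dot-product-attention gradients. For orientation, a minimal eager ATen sketch of the same computation, assuming plain 4-D [batch, heads, seq, head_dim] tensors, no GQA, no dropout, and any additive mask already folded into the scores; the function and parameter names are placeholders, not taken from this change:

#include <ATen/ATen.h>
#include <tuple>

// Reference gradients for SDPA, recovering softmax(S) as exp(S - logsumexp).
std::tuple<at::Tensor, at::Tensor, at::Tensor> sdpa_backward_reference(
    const at::Tensor& grad_out,   // [B, H, seq_q, head_dim_v]
    const at::Tensor& q,          // [B, H, seq_q, head_dim_qk]
    const at::Tensor& k,          // [B, H, seq_kv, head_dim_qk]
    const at::Tensor& v,          // [B, H, seq_kv, head_dim_v]
    const at::Tensor& logsumexp,  // [B, H, seq_q], fp32, saved by the forward pass
    double scale_val) {
  auto S = at::matmul(q, k.transpose(-2, -1)) * scale_val;  // add the mask here if one is used
  auto P = at::exp(S - logsumexp.unsqueeze(-1));            // attention probabilities
  auto grad_v = at::matmul(P.transpose(-2, -1), grad_out);  // dV = P^T @ dO
  auto grad_p = at::matmul(grad_out, v.transpose(-2, -1));  // dP = dO @ V^T
  // Softmax backward: dS = P * (dP - rowsum(dP * P))
  auto grad_s = P * (grad_p - (grad_p * P).sum(-1, /*keepdim=*/true));
  auto grad_s_scaled = grad_s * scale_val;                  // account for the scale multiply
  auto grad_q = at::matmul(grad_s_scaled, k);               // dQ = dS_scaled @ K
  auto grad_k = at::matmul(grad_s_scaled.transpose(-2, -1), q);  // dK = dS_scaled^T @ Q
  return {grad_q, grad_k, grad_v};
}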
+ logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; + op matmul_qk{ + op_id++, + op::kind::MatMul, + {params.query, params.key}, + {matmul_qk_out}, + "matmul_qk"}; + matmul_qk.set_attr(op::attr::transpose_b, true); + + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; + op scale_mul{ + op_id++, + op::kind::Multiply, + {matmul_qk_out, params.scale}, + {scaled_qk_out}, + "scale_mul"}; + + std::optional masked_qk_out; + + // For optional additive mask + std::optional mask_add; + + // For optional implicite causal mask + std::optional mask_gen_idx_row; + std::optional mask_row_idx; + std::optional mask_gen_idx_col; + std::optional mask_col_idx; + std::optional mask_gt; + std::optional mask_gt_out; + std::optional mask_select; + + if (params.attn_mask.has_value()) { + TORCH_INTERNAL_ASSERT( + !is_causal, "Additive mask cannot use with is_causal."); + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_add = { + op_id++, + op::kind::Add, + {scaled_qk_out, params.attn_mask.value()}, + {masked_qk_out.value()}, + "mask_add"}; + } else if (is_causal) { + mask_row_idx = {lt_id++, data_type::s32}; + mask_gen_idx_row = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_row_idx.value()}, + "mask_gen_idx_row"}; + mask_gen_idx_row->set_attr(op::attr::axis, -2); + + mask_col_idx = {lt_id++, data_type::s32}; + mask_gen_idx_col = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_col_idx.value()}, + "mask_gen_idx_col"}; + mask_gen_idx_col->set_attr(op::attr::axis, -1); + + mask_gt_out = {lt_id++, data_type::boolean}; + mask_gt = { + op_id++, + op::kind::GreaterEqual, + {mask_row_idx.value(), mask_col_idx.value()}, + {mask_gt_out.value()}, + "mask_gt"}; + + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_select = { + op_id++, + op::kind::Select, + {mask_gt_out.value(), scaled_qk_out, params.neg_inf.value()}, + {masked_qk_out.value()}, + "mask_select"}; + } + + // attention_probs = softmax(masked_score) = exp(masked_score - logsumexp) + logical_tensor sub_out{lt_id++, sdpa_intermediate_dtype}; + op subtract{ + op_id++, + op::kind::Subtract, + {masked_qk_out.value_or(scaled_qk_out), params.logsumexp}, + {sub_out}, + "subtract"}; + logical_tensor prob{lt_id++, sdpa_intermediate_dtype}; + op exp{op_id++, op::kind::Exp, {sub_out}, {prob}, "exp"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor prob_casted = prob; + op typecast = op(op_id++, op::kind::TypeCast, "typecast"); + if (dtype != sdpa_intermediate_dtype) { + prob_casted = logical_tensor(lt_id++, dtype); + typecast.add_inputs({prob}); + typecast.add_outputs({prob_casted}); + } + + // grad_value = prob^T * grad_out + // TODO: handle GQA headnum because (batch_size, num_head_kv, seq_len_kv, + // head_dim_v) != (batch_size, num_head_q, seqlen_kv, seq_len_q) * + // (batch_size, num_head_q, seqlen_q, head_dim_v) + op matmul_grad_value{ + op_id++, + op::kind::MatMul, + {prob_casted, params.grad_out}, + {params.grad_value}, + "matmul_grad_value"}; + matmul_grad_value.set_attr(op::attr::transpose_a, true); + + // grad_prop = grad_out * value^T + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // seq_len_kv) != (batch_size, num_head_q, seq_len_q, head_dim_v) * + // (batch_size, num_head_kv, head_dim_v, seq_len_kv) + logical_tensor grad_prop{lt_id++, sdpa_intermediate_dtype}; + op matmul_grad_prop{ + op_id++, + op::kind::MatMul, + {params.grad_out, params.value}, + {grad_prop}, + "matmul_grad_prop"}; + 
matmul_grad_prop.set_attr(op::attr::transpose_b, true); + + // grad_masked_score = softmaxbackward(grad_prop) + logical_tensor grad_masked_score{lt_id++, sdpa_intermediate_dtype}; + op softmax_backward{ + op_id++, + op::kind::SoftMaxBackward, + {grad_prop, prob}, + {grad_masked_score}, + "softmax_backward"}; + softmax_backward.set_attr(op::attr::axis, -1); + + // TODO: add output tensor grad_attn_mask = grad_masked_score once OneDNN + // supports output grad_attn_mask. + + // grad_scaled_score = grad_masked_score * scale + logical_tensor grad_scaled_score{lt_id++, sdpa_intermediate_dtype}; + op grad_scale_mul{ + op_id++, + op::kind::Multiply, + {grad_masked_score, params.scale}, + {grad_scaled_score}, + "grad_scale_mul"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor grad_scaled_score_cast = grad_scaled_score; + op typecast2 = op(op_id++, op::kind::TypeCast, "typecast2"); + if (dtype != sdpa_intermediate_dtype) { + grad_scaled_score_cast = logical_tensor(lt_id++, dtype); + typecast2.add_inputs({grad_scaled_score}); + typecast2.add_outputs({grad_scaled_score_cast}); + } + + // grad_query = grad_scaled_score_cast * key + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // head_dim_qk) != (batch_size, num_head_q, seq_len_q, seq_len_kv) * + // (batch_size, num_head_kv, seq_len_kv, head_dim_qk) + op matmul_grad_query{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.key}, + {params.grad_query}, + "matmul_grad_query"}; + + // grad_key = grad_scaled_score_cast^T * query + op matmul_grad_key{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.query}, + {params.grad_key}, + "matmul_grad_key"}; + matmul_grad_key.set_attr(op::attr::transpose_a, true); + + constexpr auto ekind = dnnl::engine::kind::gpu; + dnnl::graph::graph g(ekind); + g.add_op(matmul_qk); + g.add_op(scale_mul); + if (mask_add.has_value()) { + g.add_op(mask_add.value()); + } + if (is_causal) { + g.add_op(mask_gen_idx_row.value()); + g.add_op(mask_gen_idx_col.value()); + g.add_op(mask_gt.value()); + g.add_op(mask_select.value()); + } + g.add_op(subtract); + g.add_op(exp); + g.add_op(matmul_grad_value); + g.add_op(matmul_grad_prop); + g.add_op(softmax_backward); + g.add_op(grad_scale_mul); + g.add_op(matmul_grad_query); + g.add_op(matmul_grad_key); + if (dtype != sdpa_intermediate_dtype) { + g.add_op(typecast); + g.add_op(typecast2); + } + g.finalize(); + auto partitions = g.get_partitions(); + TORCH_INTERNAL_ASSERT( + (partitions.size() == 1) && partitions[0].is_supported(), + "oneDNN doesn't support this fusion pattern. If you'd like its support, please submit a issue."); + return partitions[0]; +} + +partition& find_or_create_backward_graph_partition( + bool is_causal, + const SDPABackwardLogicalParams& params) { + thread_local PartitionCache cache; + const data_type dtype = params.query.get_data_type(); + + // cache key creation + // patternID is determined on the basis of the arguments provided + std::bitset<32> patternID; + if (dtype == data_type::f32) { + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); + } + if (dtype == data_type::bf16) { + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); + } + // sdpa backward pattern + patternID.set( + static_cast(PartitionCache::BitType::SdpaBwdPattern), 1); + + // Refer to comments in Utils.h. 
The first 8 bits are reserved + int pos = 8; + // attn_mask + patternID.set(pos++, params.attn_mask.has_value()); + patternID.set(pos++, is_causal); + + auto partition_ = cache.find_partition(patternID); + if (!partition_.has_value()) { + // partition cache no hit + // graph building and partitioning + partition sdpa_backward_partition = + create_sdpa_backward_graph_partition(is_causal, dtype, params); + partition_ = + cache.insert_partition_cache(patternID, sdpa_backward_partition); + } + return *partition_; +} +} // namespace sdpa_backward } // namespace namespace at::native::onednn { -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -355,7 +796,9 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output) { + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp) { auto& eng = GpuEngineManager::Instance().get_engine(); auto& strm = GpuStreamManager::Instance().get_stream(); @@ -370,8 +813,8 @@ void gpu_float_sdpa( }; // OneDNN doesn't support fp32 ukernel for implicit causal mask, - // and the reference implementation is worse than aten math + explict causal - // mask. Fall back to explict causal mask until OneDNN v3.9 which has fp32 + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 // ukernel for implicit causal mask. if (is_causal && query.dtype() == at::kFloat) { attn_mask = get_tril_mask(); @@ -381,32 +824,27 @@ void gpu_float_sdpa( std::vector l_inputs, l_outputs; std::optional compiled_partition; - auto get_compiled_partition = [&]() { - const SDPALogicalParams logical_params( - query, - key, - value, - attn_mask, - output, - batch_size, - seq_len_q, - seq_len_kv, - num_head_q, - num_head_kv, - head_dim_qk, - head_dim_v, - is_causal); - auto& partition_ = - find_or_create_graph_partition(is_causal, logical_params); - auto i = logical_params.get_input(); - auto o = logical_params.get_output(); - auto compiled_partition = partition_.compile(i, o, eng); - l_inputs = std::move(i); - l_outputs = std::move(o); - return compiled_partition; - }; - - compiled_partition = get_compiled_partition(); + const sdpa_forward::SDPALogicalParams logical_params( + query, + key, + value, + attn_mask, + attention, + logsumexp, + batch_size, + seq_len_q, + seq_len_kv, + num_head_q, + num_head_kv, + head_dim_qk, + head_dim_v, + is_causal, + compute_logsumexp); + auto& partition = sdpa_forward::find_or_create_graph_partition( + is_causal, compute_logsumexp, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); Tensor softmax_scale1 = at::full( {}, @@ -416,26 +854,147 @@ void gpu_float_sdpa( if (is_causal) { neg_inf = at::full( {}, - -INFINITY, + -std::numeric_limits::infinity(), query.options().dtype(at::toOpMathType(query.scalar_type()))); } std::vector outputs = { - {l_outputs[0], eng, output.data_ptr()}, + {l_outputs[0], eng, attention.data_ptr()}, }; + if (compute_logsumexp) { + outputs.emplace_back(l_outputs[1], eng, logsumexp.data_ptr()); + } + size_t i = 0; std::vector inputs; inputs.reserve(l_inputs.size()); - inputs.emplace_back(l_inputs[i++], eng, query.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, key.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, softmax_scale1.data_ptr()); + +#define ADD_INPUT(variable) \ + 
inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(softmax_scale1); if (neg_inf.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, neg_inf->data_ptr()); + ADD_INPUT((*neg_inf)); } if (attn_mask.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, attn_mask->data_ptr()); + ADD_INPUT((*attn_mask)); } - inputs.emplace_back(l_inputs[i++], eng, value.data_ptr()); + ADD_INPUT(value); +#undef ADD_INPUT + + compiled_partition->execute(strm, inputs, outputs); +} + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value) { + auto& eng = GpuEngineManager::Instance().get_engine(); + auto& strm = GpuStreamManager::Instance().get_stream(); + + const auto get_tril_mask = [&]() { + auto opts = query.options(); + auto bool_tril = + at::ones_symint({seq_len_q, seq_len_kv}, opts.dtype(at::kBool)).tril(); + return at::where( + bool_tril, + 0.f, + at::scalar_tensor(-std::numeric_limits::infinity(), opts)); + }; + + // OneDNN doesn't support fp32 ukernel for implicit causal mask, + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 + // ukernel for implicit causal mask. + if (is_causal && query.dtype() == at::kFloat) { + attn_mask = get_tril_mask(); + is_causal = false; + } + + std::vector l_inputs, l_outputs; + std::optional compiled_partition; + + const sdpa_backward::SDPABackwardLogicalParams logical_params( + grad_out, + query, + key, + value, + out, + logsumexp, + attn_mask, + grad_query, + grad_key, + grad_value, + batch_size, + num_head_q, + num_head_kv, + seq_len_q, + seq_len_kv, + head_dim_qk, + head_dim_v, + is_causal); + auto& partition = sdpa_backward::find_or_create_backward_graph_partition( + is_causal, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); + + Tensor softmax_scale = at::full( + {}, scale, query.options().dtype(at::toOpMathType(query.scalar_type()))); + std::optional neg_inf; + if (is_causal) { + neg_inf = at::full( + {}, + -std::numeric_limits::infinity(), + query.options().dtype(at::toOpMathType(query.scalar_type()))); + } + + std::vector outputs = { + {l_outputs[0], eng, grad_query.data_ptr()}, + {l_outputs[1], eng, grad_key.data_ptr()}, + {l_outputs[2], eng, grad_value.data_ptr()}, + }; + + size_t i = 0; + std::vector inputs; + inputs.reserve(l_inputs.size()); + +#define ADD_INPUT(variable) \ + inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(grad_out); + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(value); + ADD_INPUT(out); + ADD_INPUT(logsumexp); + ADD_INPUT(softmax_scale); + if (neg_inf.has_value()) { + ADD_INPUT((*neg_inf)); + } + if (attn_mask.has_value()) { + ADD_INPUT((*attn_mask)); + } +#undef ADD_INPUT + compiled_partition->execute(strm, inputs, outputs); } } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h index ac8645d3e4a50..52f89bc1395d7 100644 --- 
a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h @@ -110,11 +110,21 @@ struct PartitionCache { // bit 1: is uint8 // bit 2: fp16(0) / bf16(1) // bit 3: is fp32 - // bit 4: is sdp pattern - // bit 5-7: N/A + // bit 4: is sdpa pattern + // bit 5: is sdpa backward pattern + // bit 6-7: reserved for future use // The rest of the bits depend upon the arguments provided // However, down the line, we might have different bitsets for different // patterns + enum class BitType : uint8_t { + Int8 = 0, + Uint8 = 1, + Bfloat16 = 2, + Float32 = 3, + SdpaPattern = 4, + SdpaBwdPattern = 5 + }; + dnnl::graph::partition& insert_partition_cache( std::bitset<32>& patternID, dnnl::graph::partition& p) { diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h index e73cb73e8b1e7..6b2bf01e6d73d 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -164,7 +164,7 @@ void quantized_matmul( std::string_view unary_post_op_algorithm, bool m2_trnas); -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -178,5 +178,28 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output); + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp); + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value); } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp index 1c6e2a6c89dae..c014313a5b35d 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp @@ -1,5 +1,7 @@ #include #include +#include + #include #include #include @@ -7,7 +9,7 @@ using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qconv_decide_out_dtype( +inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +21,7 @@ static inline c10::ScalarType qconv_decide_out_dtype( return dst_dtype; } -static at::Tensor qconv_prepack_xpu( +at::Tensor QConvoneDNNXPU::qconv_prepack_xpu( at::Tensor weight, at::Tensor weight_scales, double input_scale, @@ -33,222 +35,265 @@ static at::Tensor qconv_prepack_xpu( return weight; } -class QConvoneDNNXPU final { - public: - static at::Tensor run_pointwise( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double inv_output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - if (act.dim() == 3 || act.dim() == 5) { - TORCH_CHECK( - attr == "none", - "quantized pointwise conv", - act.dim() - 2, - "d doesn't support unary_post_op fusion. 
Got unary_post_op:", - attr, - "."); - } else { - TORCH_CHECK( - attr == "none" || attr == "relu" || attr == "hardtanh" || - attr == "hardswish" || attr == "swish", - "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", - attr, - "."); - } +at::Tensor QConvoneDNNXPU::run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + if (act.dim() == 3 || act.dim() == 5) { + TORCH_CHECK( + attr == "none", + "quantized pointwise conv", + act.dim() - 2, + "d doesn't support unary_post_op fusion. Got unary_post_op:", + attr, + "."); + } else { + TORCH_CHECK( + attr == "none" || attr == "relu" || attr == "hardtanh" || + attr == "hardswish" || attr == "swish", + "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", + attr, + "."); + } - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? 
get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - Tensor output = - at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + Tensor output = + at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - return quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - inv_output_scale, - output_zero_point, - /*accum*/ std::nullopt, - /*accum_scale*/ 0.0, - /*accum_zero_point*/ 0, - /*output_dtype*/ output_dtype, - /*binary_attr*/ std::nullopt, - /*binary_alpha*/ std::nullopt, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } + return quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + inv_output_scale, + output_zero_point, + /*accum*/ std::nullopt, + /*accum_scale*/ 0.0, + /*accum_zero_point*/ 0, + /*output_dtype*/ output_dtype, + /*binary_attr*/ std::nullopt, + /*binary_alpha*/ std::nullopt, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_tensor( - at::Tensor act, - at::Tensor act_scale, - at::Tensor act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - return run_pointwise( - act, - act_scale.item().toDouble(), - act_zero_point.item().toLong(), - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - groups, - output_scale, - output_zero_point, - output_dtype, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } +at::Tensor QConvoneDNNXPU::run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + return run_pointwise( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_binary( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor 
weight_scales, - at::Tensor weight_zero_points, - at::Tensor accum, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - double accum_scale, - int64_t accum_zero_point, - std::string_view binary_attr, - std::optional alpha, - std::optional unary_attr, - torch::List> unary_scalars, - std::optional unary_algorithm) { - TORCH_CHECK( - act.dim() == 4 && binary_attr == "sum" && - (!unary_attr.has_value() || - (unary_attr.has_value() && - (unary_attr.value() == "none" || unary_attr.value() == "relu"))), - "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", - binary_attr, - " unary_post_op: ", - unary_attr.has_value() ? unary_attr.value() : "none", - ".") +at::Tensor QConvoneDNNXPU::run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + TORCH_CHECK( + act.dim() == 4 && binary_attr == "sum" && + (!unary_attr.has_value() || + (unary_attr.has_value() && + (unary_attr.value() == "none" || unary_attr.value() == "relu"))), + "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", + binary_attr, + " unary_post_op: ", + unary_attr.has_value() ? unary_attr.value() : "none", + ".") - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - bool has_accum_postop_sum = binary_attr == "sum"; - Tensor output = has_accum_postop_sum - ? accum - : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + bool has_accum_postop_sum = binary_attr == "sum"; + Tensor output = has_accum_postop_sum + ? 
accum + : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - output = quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - output_scale, - output_zero_point, - /*accum*/ accum, - /*accum_scale*/ accum_scale, - /*accum_zero_point*/ accum_zero_point, - /*output_dtype*/ output_dtype, - /*binary_attr*/ binary_attr, - /*binary_alpha*/ alpha, - /*unary_attr*/ unary_attr, - /*unary_scalars*/ unary_scalars, - /*unary_algorithm*/ unary_algorithm); + output = quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + output_scale, + output_zero_point, + /*accum*/ accum, + /*accum_scale*/ accum_scale, + /*accum_zero_point*/ accum_zero_point, + /*output_dtype*/ output_dtype, + /*binary_attr*/ binary_attr, + /*binary_alpha*/ alpha, + /*unary_attr*/ unary_attr, + /*unary_scalars*/ unary_scalars, + /*unary_algorithm*/ unary_algorithm); - if (!has_accum_postop_sum) { - return output; - } else { - return accum; - } + if (!has_accum_postop_sum) { + return output; + } else { + return accum; } -}; +} + +at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + return run_pointwise_binary( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + accum, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + accum_scale, + accum_zero_point, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm); +} TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_prepack"), - TORCH_FN(xpu::qconv_prepack_xpu)); + TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu)); m.impl( TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"), QConvoneDNNXPU::run_pointwise); @@ -267,6 +312,9 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"), QConvoneDNNXPU::run_pointwise_tensor); + m.impl( + TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"), + QConvoneDNNXPU::run_pointwise_binary_tensor); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.h b/aten/src/ATen/native/mkldnn/xpu/qconv.h new file mode 100644 index 0000000000000..e9ddd4fa29697 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +namespace at::native::xpu { +class QConvoneDNNXPU final { + public: + C10_API static at::Tensor run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + 
std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + static inline c10::ScalarType qconv_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + + static at::Tensor qconv_prepack_xpu( + at::Tensor weight, + at::Tensor weight_scales, + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape); +}; + +} // namespace at::native::xpu \ No newline at end of file diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp index 7e3f2f01fa1e6..e9584e8289eb2 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp @@ -1,13 +1,14 @@ #include #include +#include #include using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qlinear_decide_out_dtype( +inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +20,7 @@ static inline c10::ScalarType qlinear_decide_out_dtype( return dst_dtype; } -static Tensor q_linear_pointwise( +Tensor QLinearOnednnXPU::q_linear_pointwise( Tensor act, double act_scale, int64_t act_zero_point, @@ -78,7 +79,7 @@ static Tensor q_linear_pointwise( return qout; } -static Tensor q_linear_pointwise_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -137,7 +138,7 @@ static Tensor q_linear_pointwise_tensor( return 
qout; } -static Tensor q_linear_pointwise_binary( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary( Tensor act, double act_scale, int64_t act_zero_point, @@ -208,7 +209,7 @@ static Tensor q_linear_pointwise_binary( return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout; } -static Tensor q_linear_pointwise_binary_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -248,7 +249,7 @@ static Tensor q_linear_pointwise_binary_tensor( unary_post_op_algorithm); } -static at::Tensor q_linear_prepack_onednn( +Tensor QLinearOnednnXPU::q_linear_prepack_onednn( at::Tensor weight, std::optional> input_shape) { at::Tensor weight_transposed = weight.transpose(0, 1); @@ -258,19 +259,19 @@ static at::Tensor q_linear_prepack_onednn( TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"), - TORCH_FN(q_linear_pointwise)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"), - TORCH_FN(q_linear_pointwise_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"), - TORCH_FN(q_linear_prepack_onednn)); + TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"), - TORCH_FN(q_linear_pointwise_binary)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"), - TORCH_FN(q_linear_pointwise_binary_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor)); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.h b/aten/src/ATen/native/mkldnn/xpu/qlinear.h new file mode 100644 index 0000000000000..7382276664242 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.h @@ -0,0 +1,91 @@ +#pragma once + +#include +#include +#include + +namespace at::native::xpu { + +class QLinearOnednnXPU final { + public: + C10_API static Tensor q_linear_pointwise( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + 
std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_prepack_onednn( + at::Tensor weight, + std::optional> input_shape); + + static inline c10::ScalarType qlinear_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + +}; // class QLinearOnednnXPU + +} // namespace at::native::xpu diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index e6f87f5499a47..f9cd28ca06fa8 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -88,14 +88,8 @@ std::string getArrayRefString(const IntArrayRef s); // use has_storage() on the returned tensor to determine if src actually is a view Tensor gatherViewTensor(const Tensor& src, Tensor& dst); Tensor& scatterViewTensor(const Tensor& src, Tensor& output); -MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64 = false); -MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64 = false); +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); MPSNDArray* getStridedMPSNDArray(const TensorBase& src, MPSNDArray* srcNDArray); MPSNDArray* getMPSNDArray(const TensorBase& t, const IntArrayRef& sizes = {}, const IntArrayRef& strides = {}); @@ -435,14 +429,6 @@ inline T* LookUpOrCreateCachedGraph(const std::string& key, std::functionexecuteMPSGraph(mpsGraph, feeds, results, SyncType::COMMIT_ADAPTIVE); } -static inline void checkSupportsComplex() { - TORCH_CHECK_TYPE(supportsComplex(), "MPS complex types are only supported on MacOS 14.0 or newer."); -} - MPSDataType getMPSDataType(ScalarType scalar_type) { switch (scalar_type) { case ScalarType::Float: @@ -100,7 +96,6 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { case ScalarType::Half: return MPSDataTypeFloat16; case ScalarType::BFloat16: - checkSupportsBFloat16(); return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; @@ -119,10 +114,8 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { "Cannot convert a float64 Tensor to MPS as the MPS framework doesn't support float64. " "Please use float32 instead.") case ScalarType::ComplexHalf: - checkSupportsComplex(); return MPSDataTypeComplexFloat16; case ScalarType::ComplexFloat: - checkSupportsComplex(); return MPSDataTypeComplexFloat32; // Unsigned types case ScalarType::UInt64: @@ -140,16 +133,10 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { // #issue 104398441 sortWithTensor and argsortWithTensor has support of // Int32, Half and Float32 types. These utilities are to help cast to these // types. 
-MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64) { +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input) { MPSDataType dataType = getMPSDataType(input.scalar_type()); - bool condition = - (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && (dataType != MPSDataTypeFloat16); - if (includesInt64) { - condition = condition && (dataType != MPSDataTypeInt64); - } + bool condition = (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && + (dataType != MPSDataTypeFloat16) && (dataType != MPSDataTypeInt64); if (condition) { dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32; return [mpsGraph castTensor:inputTensor toType:dataType name:@"castInputTensor"]; @@ -160,16 +147,10 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { // #issue 104398441 sortWithTensor and argsortWithTensor has support of // Int32, Half and Float32 types. These utilities are to help cast from these // types. -MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, - MPSGraphTensor* inputTensor, - const TensorBase& input, - bool includesInt64) { +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input) { MPSDataType dataType = getMPSDataType(input.scalar_type()); - bool condition = - (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && (dataType != MPSDataTypeFloat16); - if (includesInt64) { - condition = condition && (dataType != MPSDataTypeInt64); - } + bool condition = (dataType != MPSDataTypeInt32) && (dataType != MPSDataTypeFloat32) && + (dataType != MPSDataTypeFloat16) && (dataType != MPSDataTypeInt64); if (condition) { inputTensor = [mpsGraph castTensor:inputTensor toType:dataType name:@"castInputTensor"]; } @@ -186,7 +167,6 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Half: return MPSDataTypeFloat16; case ScalarType::BFloat16: - checkSupportsBFloat16(); return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; @@ -201,13 +181,11 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Bool: return MPSDataTypeBool; case ScalarType::ComplexHalf: - checkSupportsComplex(); return MPSDataTypeComplexFloat16; // This is an intentional fallthrough supporting ComplexDouble for Scalar // types as they are casted to Complex64 currently. case ScalarType::ComplexDouble: case ScalarType::ComplexFloat: - checkSupportsComplex(); return MPSDataTypeComplexFloat32; // Unsigned types case ScalarType::UInt64: @@ -267,7 +245,6 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Half: return "half"; case ScalarType::BFloat16: - checkSupportsBFloat16(); return "bfloat"; case ScalarType::Int: return "int"; @@ -879,9 +856,7 @@ void executeMPSAllocatorCallback(void* ptr, EventType event) override {} MTLCompileOptions* options = compile_options; if (!options) { options = [[MTLCompileOptions new] autorelease]; - // Need 3.0 for atomic oprations, 3.1 introduces bfloat support - [options setLanguageVersion:is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) ? MTLLanguageVersion3_1 - : MTLLanguageVersion3_0]; + [options setLanguageVersion:MTLLanguageVersion3_1]; if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) { options.mathMode = fast_math ? 
MTLMathModeFast : MTLMathModeSafe; options.mathFloatingPointFunctions = diff --git a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal index f6f4935608e49..0539eab79500d 100644 --- a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal @@ -39,6 +39,13 @@ struct lerp_alpha_functor { } }; +struct native_dropout_mask_and_scale_functor { + template + inline TA operator()(const TI a, const TI b, const TA scale) { + return static_cast(a) * static_cast(b) * scale; + } +}; + struct fmax_functor { template inline T operator()(const T a, const T b) { @@ -315,6 +322,20 @@ struct fmod_functor { } }; +struct igamma_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igamma(a, b); + } +}; + +struct igammac_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igammac(a, b); + } +}; + #define REGISTER_INTEGER_BINARY_OP(NAME) \ REGISTER_BINARY_OP(NAME, long, long); \ REGISTER_BINARY_OP(NAME, int, int); \ @@ -386,6 +407,8 @@ REGISTER_OPMATH_FLOAT_BINARY_OP(remainder); REGISTER_INTEGER_BINARY_OP(remainder); REGISTER_OPMATH_FLOAT_BINARY_OP(fmod); REGISTER_INTEGER_BINARY_OP(fmod); +REGISTER_OPMATH_FLOAT_BINARY_OP(igamma); +REGISTER_OPMATH_FLOAT_BINARY_OP(igammac); REGISTER_BINARY_ALPHA_OP(add_alpha, long, long, long); REGISTER_BINARY_ALPHA_OP(add_alpha, int, int, int); REGISTER_BINARY_ALPHA_OP(add_alpha, float, float, float); @@ -411,6 +434,10 @@ REGISTER_BINARY_ALPHA_OP(lerp_alpha, uchar, uchar, uchar); REGISTER_BINARY_ALPHA_OP(lerp_alpha, char, char, char); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bool, bool, bool); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, float, float, float); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, bfloat, bfloat, bfloat); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, half, half, half); + REGISTER_BINARY_ALPHA_OP(add_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(sub_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bfloat, bfloat, bfloat); diff --git a/aten/src/ATen/native/mps/kernels/GridSampler.h b/aten/src/ATen/native/mps/kernels/GridSampler.h new file mode 100644 index 0000000000000..c2b3cad3cd47d --- /dev/null +++ b/aten/src/ATen/native/mps/kernels/GridSampler.h @@ -0,0 +1,25 @@ +#pragma once +#include + +#ifdef __METAL__ +enum class GridSamplerInterpolation { Bilinear, Nearest, Bicubic }; +enum class GridSamplerPadding { Zeros, Border, Reflection }; +#else +#include +using at::native::GridSamplerInterpolation; +using at::native::GridSamplerPadding; +#endif + +template +struct GridSamplerParams { + int32_t sampler_dims; + ::c10::metal::array output_sizes; + ::c10::metal::array output_strides; + ::c10::metal::array input_sizes; + ::c10::metal::array input_strides; + ::c10::metal::array grid_sizes; + ::c10::metal::array grid_strides; + GridSamplerInterpolation interpolation_mode; + GridSamplerPadding padding_mode; + bool align_corners; +}; diff --git a/aten/src/ATen/native/mps/kernels/GridSampler.metal b/aten/src/ATen/native/mps/kernels/GridSampler.metal new file mode 100644 index 0000000000000..331793e08d664 --- /dev/null +++ b/aten/src/ATen/native/mps/kernels/GridSampler.metal @@ -0,0 +1,324 @@ +#include +#include +#include +#include + +using namespace metal; +using namespace c10::metal; + +struct GridSamplerOffsets { + int32_t output; + int32_t input; + int32_t grid; + + GridSamplerOffsets() : output(0), input(0), grid(0) {} +}; + +// 
Find offsets into the tensors that this thread will operate on, +// based on the thread ID. +static GridSamplerOffsets find_grid_sampler_offsets( + constant int32_t* output_sizes, + constant int32_t* output_strides, + constant int32_t* input_strides, + constant int32_t* grid_strides, + int32_t sampler_dims, + uint tid) { + auto dims = sampler_dims + 2; + auto output_idx = static_cast(tid); + GridSamplerOffsets offsets; + + for (auto dim = dims - 1; dim >= 0; dim--) { + auto dim_idx = output_idx % output_sizes[dim]; + output_idx = output_idx / output_sizes[dim]; + + // Select the output element that this thread will calculate. + // output shape: + // 2 sampler dims: (N, C, Hout, Wout) + // 3 sampler dims: (N, C, Dout, Hout, Wout) + offsets.output += output_strides[dim] * dim_idx; + + // Select the batch and channel for the input. + // input shape: + // 2 sampler dims: (N, C, Hin, Win) + // 3 sampler dims: (N, C, Din, Hin, Win) + if (dim < 2) { + offsets.input += input_strides[dim] * dim_idx; + } + + // Select the grid coordinates for the output element. + // grid shape: + // 2 sampler dims: (N, Hout, Wout, 2) + // 3 sampler dims: (N, Dout, Hout, Wout, 3) + if (dim == 0) { + offsets.grid += grid_strides[dim] * dim_idx; + } else if (dim >= 2) { + offsets.grid += grid_strides[dim - 1] * dim_idx; + } + } + + return offsets; +} + +// Mod function which gives postive output when `a` is negative +static int32_t mod(int32_t a, int32_t b) { + auto r = a % b; + return r + (r < 0 ? b : 0); +} + +// Sentinel index value to indicate zero padding +constant int32_t IDX_ZERO = -1; + +// Apply padding to an index into the input +static int32_t pad_input_index( + int32_t idx, + int32_t input_size, + GridSamplerPadding padding_mode, + bool align_corners) { + int32_t idx_padded = idx; + + if (padding_mode == GridSamplerPadding::Zeros) { + idx_padded = (idx < 0) ? IDX_ZERO : idx_padded; + idx_padded = (idx >= input_size) ? IDX_ZERO : idx_padded; + + } else if (padding_mode == GridSamplerPadding::Border) { + idx_padded = (idx < 0) ? 0 : idx_padded; + idx_padded = (idx >= input_size) ? input_size - 1 : idx_padded; + + } else if (padding_mode == GridSamplerPadding::Reflection) { + auto scale_length = align_corners ? (input_size - 1) : input_size; + auto idx_mod = mod(idx, scale_length); + auto idx_mod_reverse = (input_size - 1) - idx_mod; + bool is_reverse = (abs(idx - idx_mod) / scale_length) % 2 == 1; + idx_padded = is_reverse ? idx_mod_reverse : idx_mod; + } + return idx_padded; +} + +template +T get_tensor_val( + constant T* input, + constant int32_t* input_strides, + int32_t indices[dims]) { + bool found_idx_zero = false; + int32_t offset = 0; + + for (auto dim = 0; dim < dims; dim++) { + auto idx = indices[dim]; + found_idx_zero = found_idx_zero || (idx == IDX_ZERO); + offset += (found_idx_zero ? 0 : idx) * input_strides[dim]; + } + + return found_idx_zero ? 0 : input[offset]; +} + +// This function performs 3D linear interpolation for one value. One way to +// think of how this works is to imagine a unit cube where each corner of the +// cube has one scalar value associated with it. Inside the cube, the values +// change linearly, so the gradient is constant. The values associated with each +// corner are given by the `input`, indexed at all eight different combinations +// of the `left_indices` and `right_indices`. Given a 3D coordinate anywhere +// within the cube, specified by the `scales` argument, we must calculate the +// value associated with that position. 
+template +T interpolate_linear_3d( + constant T* input, + constant int32_t* input_strides, + int32_t left_indices[3], + int32_t right_indices[3], + opmath_t scales[3]) { + int32_t a_idx[3] = {left_indices[0], left_indices[1], left_indices[2]}; + int32_t b_idx[3] = {left_indices[0], left_indices[1], right_indices[2]}; + int32_t c_idx[3] = {left_indices[0], right_indices[1], left_indices[2]}; + int32_t d_idx[3] = {left_indices[0], right_indices[1], right_indices[2]}; + int32_t e_idx[3] = {right_indices[0], left_indices[1], left_indices[2]}; + int32_t f_idx[3] = {right_indices[0], left_indices[1], right_indices[2]}; + int32_t g_idx[3] = {right_indices[0], right_indices[1], left_indices[2]}; + int32_t h_idx[3] = {right_indices[0], right_indices[1], right_indices[2]}; + auto a = + static_cast>(get_tensor_val<3>(input, input_strides, a_idx)); + auto b = + static_cast>(get_tensor_val<3>(input, input_strides, b_idx)); + auto c = + static_cast>(get_tensor_val<3>(input, input_strides, c_idx)); + auto d = + static_cast>(get_tensor_val<3>(input, input_strides, d_idx)); + auto e = + static_cast>(get_tensor_val<3>(input, input_strides, e_idx)); + auto f = + static_cast>(get_tensor_val<3>(input, input_strides, f_idx)); + auto g = + static_cast>(get_tensor_val<3>(input, input_strides, g_idx)); + auto h = + static_cast>(get_tensor_val<3>(input, input_strides, h_idx)); + + auto scale0_right = scales[0]; + auto scale1_right = scales[1]; + auto scale2_right = scales[2]; + auto scale0_left = 1 - scale0_right; + auto scale1_left = 1 - scale1_right; + auto scale2_left = 1 - scale2_right; + + return static_cast( + scale0_left * scale1_left * scale2_left * a + + scale0_left * scale1_left * scale2_right * b + + scale0_left * scale1_right * scale2_left * c + + scale0_left * scale1_right * scale2_right * d + + scale0_right * scale1_left * scale2_left * e + + scale0_right * scale1_left * scale2_right * f + + scale0_right * scale1_right * scale2_left * g + + scale0_right * scale1_right * scale2_right * h); +} + +// Calculates a single output element. +// `input` shape: +// 2 sampler dims: (Hin, Win) +// 3 sampler dims: (Din, Hin, Win) +// `coords` values: +// 2 sampler dims: (Wcoord, Hcoord) +// 3 sampler dims: (Wcoord, Hcoord, Dcoord) +template +void grid_sampler_single_element( + device T* output, + constant T* input, + constant T* coords, + int32_t dims, + constant int32_t* input_sizes, + constant int32_t* input_strides, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode, + bool align_corners) { + int32_t left_indices[3]; + int32_t right_indices[3]; + opmath_t scales[3]; + + // For each dimension, find the pair of indices in the cooresponding dimension + // of `input` which surround the grid coordinate in that dimension. We'll do + // this by mapping different coordiante spaces onto each other. There are + // basically three different coordinate spaces to keep in mind: + // + // * aligned grid space + // - `-1` refers to the leftmost input value. + // - `1` refers to the rightmost input value. + // + // * unaligned grid space + // - `-1` refers to the midpoint between the leftmost input value and + // a padding value to the left of that. + // - `1` refers to the midpoint between the rightmost input value and + // a padding value to the right of that. + // + // * input index space + // - `n` refers to the n-th value of the input. + // - `0` refers to the leftmost input value. + // - `N-1` refers to the rightmost input value. 
+ // + // If `align_corners == False`, then the coordinates are is in unaligned grid + // space, and we will map it onto aligned grid space. If `align_corners == + // True`, then coordinates are already in aligned grid space. + // + // Then we will map unaligned grid space onto input index space, making it + // relatively simple to find the two input indices that surround the + // coordinate. + for (auto coord_dim = 0; coord_dim < dims; coord_dim++) { + auto input_dim = dims - coord_dim - 1; + auto input_size = input_sizes[input_dim]; + auto coord = static_cast>(coords[coord_dim]); + + // Interpret nan as -1 + coord = isnan(coord) ? -1 : coord; + + if (!align_corners) { + // Map unaligned grid space to aligned grid space + auto corner_alignment_factor = static_cast>(input_size) / + static_cast>(input_size - 1); + coord = coord * corner_alignment_factor; + } + + // Map aligned grid space to input index space + coord = (coord + 1) * (static_cast>(input_size - 1) / 2); + + // Get the input indices surrounding the coordinate, apply padding to them, + // and obtain the scaling factor between the two for interpolation. + auto left_idx = static_cast(floor(coord)); + auto right_idx = static_cast(ceil(coord)); + left_indices[input_dim] = + pad_input_index(left_idx, input_size, padding_mode, align_corners); + right_indices[input_dim] = + pad_input_index(right_idx, input_size, padding_mode, align_corners); + + auto scale = coord - left_idx; + + if (interpolation_mode == GridSamplerInterpolation::Nearest) { + // TODO: For some reason, rounding the scale to 0 or 1 and then using + // linear interpolation seems to work perfectly with zero padding mode, + // but we get flaky failures with border and reflection padding modes. + // Need to investigate and fix it. + scale = (scale <= 0.5) ? 0 : 1; + } + scales[input_dim] = scale; + } + + // Now that we have the bounding indices and scale factor for each dimension + // of the input, we can interpolate. 
+ if (dims == 3) { + *output = interpolate_linear_3d( + input, input_strides, left_indices, right_indices, scales); + } +} + +template +kernel void grid_sampler( + device T* output [[buffer(0)]], + constant T* input [[buffer(1)]], + constant T* grid [[buffer(2)]], + constant GridSamplerParams<5>& params [[buffer(3)]], + uint tid [[thread_position_in_grid]]) { + auto output_sizes = params.output_sizes.data(); + auto output_strides = params.output_strides.data(); + auto input_sizes = params.input_sizes.data(); + auto input_strides = params.input_strides.data(); + auto grid_strides = params.grid_strides.data(); + auto sampler_dims = params.sampler_dims; + + auto offsets = find_grid_sampler_offsets( + output_sizes, + output_strides, + input_strides, + grid_strides, + sampler_dims, + tid); + + output += offsets.output; + input += offsets.input; + auto coords = grid + offsets.grid; + + input_sizes += 2; + input_strides += 2; + + auto interpolation_mode = params.interpolation_mode; + auto padding_mode = params.padding_mode; + auto align_corners = params.align_corners; + + grid_sampler_single_element( + output, + input, + coords, + sampler_dims, + input_sizes, + input_strides, + interpolation_mode, + padding_mode, + align_corners); +} + +#define REGISTER_GRID_SAMPLER_OP(DTYPE) \ + template [[host_name("grid_sampler_" #DTYPE)]] \ + kernel void grid_sampler( \ + device DTYPE * output [[buffer(0)]], \ + constant DTYPE * input [[buffer(1)]], \ + constant DTYPE * grid [[buffer(2)]], \ + constant GridSamplerParams<5> & params [[buffer(3)]], \ + uint tid [[thread_position_in_grid]]); + +REGISTER_GRID_SAMPLER_OP(float); +REGISTER_GRID_SAMPLER_OP(half); +REGISTER_GRID_SAMPLER_OP(bfloat); diff --git a/aten/src/ATen/native/mps/kernels/Indexing.metal b/aten/src/ATen/native/mps/kernels/Indexing.metal index 7503d8b2b1c8b..b41e64d70ced5 100644 --- a/aten/src/ATen/native/mps/kernels/Indexing.metal +++ b/aten/src/ATen/native/mps/kernels/Indexing.metal @@ -5,29 +5,6 @@ using namespace metal; using namespace c10::metal; -namespace c10 { -namespace metal { -// There are no atomic 64-bit add in Metal yet, but this implements a consistent -// add I.e. if multiple threads are modify the same 64-bit value, results stored -// at the address will eventually be equal to its original value plus sum of all -// operands -template <> -struct AtomicType { - using type = ::metal::atomic; - static inline void atomic_add(device type* data, long offset, long value) { - const auto value_bits = as_type(value); - const uint low = static_cast(value_bits); - uint high = static_cast(value_bits >> 32); - auto ptr = data + (offset << 1); - auto old_low = atomic_fetch_add_explicit(ptr, low, memory_order_relaxed); - high += (old_low + low < old_low) ? 
1 : 0; - atomic_fetch_add_explicit(ptr + 1, high, memory_order_relaxed); - } -}; - -} // namespace metal -} // namespace c10 - struct IndexAB { constant int64_t* indexArray; }; @@ -234,13 +211,15 @@ REGISTER_INDEX_OP_ALL_DTYPES(put_serial); REGISTER_INDEX_OP(put_accumulate, float, float); REGISTER_INDEX_OP(put_accumulate, half, half); +REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat); REGISTER_INDEX_OP(put_accumulate, long, long); REGISTER_INDEX_OP(put_accumulate, int, int); REGISTER_INDEX_OP(put_accumulate, short, short); REGISTER_INDEX_OP(put_accumulate, char, char); REGISTER_INDEX_OP(put_accumulate, uchar, uchar); REGISTER_INDEX_OP(put_accumulate, bool, bool); -REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat); +REGISTER_INDEX_OP(put_accumulate, float2, float2); +REGISTER_INDEX_OP(put_accumulate, half2, half2); template kernel void kernel_index_offsets( @@ -379,6 +358,7 @@ kernel void index_copy_strided( constant long* input_strides, constant long* output_strides, constant long* source_strides, + constant long& indices_stride, uint thread_index [[thread_position_in_grid]]) { int pos[max_ndim]; pos_from_thread_index(int(thread_index), pos, sizes, ndim); @@ -395,7 +375,7 @@ kernel void index_copy_strided( // find the last index in the indices array that equals this coordinate int last_matching_index = -1; for (uint i = 0; i < indices_numel; i++) { - if (indices[i] == orig_dim) { + if (indices[i * indices_stride] == orig_dim) { last_matching_index = int(i); } } @@ -434,6 +414,7 @@ kernel void index_copy_strided( constant long*, \ constant long*, \ constant long*, \ + constant long&, \ uint); #define REGISTER_MASKED_FILL_SCALAR(SIZE, DTYPE) \ diff --git a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal index 92774f3ff2668..4ba2bca720db7 100644 --- a/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal +++ b/aten/src/ATen/native/mps/kernels/LinearAlgebra.metal @@ -68,6 +68,37 @@ kernel void matmul( } } +template +kernel void addmm( + constant T* mat1Data [[buffer(0)]], + constant T* mat2Data [[buffer(1)]], + device T* outputData [[buffer(2)]], + constant T* biasData [[buffer(3)]], + constant array, 2>& alpha_beta [[buffer(4)]], + constant array& strides [[buffer(5)]], + constant uint3& sizes [[buffer(6)]], + uint2 tid [[thread_position_in_threadgroup]], + uint2 thread_id [[thread_position_in_grid]]) { + threadgroup T A_tile[TILE_DIM][TILE_DIM]; + threadgroup T B_tile[TILE_DIM][TILE_DIM]; + + auto sum = matmul_inner( + mat1Data, + mat2Data, + reinterpret_cast&>(strides), + sizes, + A_tile, + B_tile, + tid, + thread_id); + if (thread_id.y < sizes.x && thread_id.x < sizes.z) { + auto bias = + biasData[thread_id.y * strides[3].x + thread_id.x * strides[3].y]; + outputData[thread_id.y * strides[2].x + thread_id.x * strides[2].y] = + static_cast(alpha_beta[0] * sum + alpha_beta[1] * bias); + } +} + template kernel void naive_bmm( constant T* mat1Data [[buffer(0)]], @@ -613,17 +644,15 @@ kernel void applyPivots( } } -#define INSTANTIATE_NAIVE_MM(DTYPE) \ - template [[host_name("matmul_" #DTYPE)]] kernel void matmul( \ - constant DTYPE * mat1Data [[buffer(0)]], \ - constant DTYPE * mat2Data [[buffer(1)]], \ - device DTYPE * outputData [[buffer(2)]], \ - constant array & strides [[buffer(3)]], \ - constant uint3 & sizes [[buffer(4)]], \ - uint2 tid [[thread_position_in_threadgroup]], \ - uint2 group_id [[threadgroup_position_in_grid]]) - -#define INSTANTIATE_NAIVE_BMM(DTYPE) \ +#define INSTANTIATE_MM_OPS(DTYPE) \ + template 
[[host_name("matmul_" #DTYPE)]] kernel void matmul( \ + constant DTYPE * mat1Data [[buffer(0)]], \ + constant DTYPE * mat2Data [[buffer(1)]], \ + device DTYPE * outputData [[buffer(2)]], \ + constant array & strides [[buffer(3)]], \ + constant uint3 & sizes [[buffer(4)]], \ + uint2 tid [[thread_position_in_threadgroup]], \ + uint2 group_id [[threadgroup_position_in_grid]]); \ template [[host_name("naive_bmm_" #DTYPE)]] kernel void naive_bmm( \ constant DTYPE * mat1Data [[buffer(0)]], \ constant DTYPE * mat2Data [[buffer(1)]], \ @@ -631,20 +660,26 @@ kernel void applyPivots( constant array & strides [[buffer(3)]], \ constant uint4 & sizes [[buffer(4)]], \ uint3 tid [[thread_position_in_threadgroup]], \ - uint3 group_id [[threadgroup_position_in_grid]]) + uint3 group_id [[threadgroup_position_in_grid]]); \ + template [[host_name("addmm_" #DTYPE)]] kernel void addmm( \ + constant DTYPE * mat1Data [[buffer(0)]], \ + constant DTYPE * mat2Data [[buffer(1)]], \ + device DTYPE * outputData [[buffer(2)]], \ + constant DTYPE * biasData [[buffer(3)]], \ + constant array, 2> & \ + alpha_beta [[buffer(4)]], \ + constant array & strides [[buffer(5)]], \ + constant uint3 & sizes [[buffer(6)]], \ + uint2 tid [[thread_position_in_threadgroup]], \ + uint2 group_id [[threadgroup_position_in_grid]]) -INSTANTIATE_NAIVE_MM(float); -INSTANTIATE_NAIVE_MM(half); -INSTANTIATE_NAIVE_MM(bfloat); +INSTANTIATE_MM_OPS(float); +INSTANTIATE_MM_OPS(half); +INSTANTIATE_MM_OPS(bfloat); // Integral MM -INSTANTIATE_NAIVE_MM(short); -INSTANTIATE_NAIVE_MM(int); -INSTANTIATE_NAIVE_MM(long); -INSTANTIATE_NAIVE_MM(char); -INSTANTIATE_NAIVE_MM(uchar); -INSTANTIATE_NAIVE_BMM(short); -INSTANTIATE_NAIVE_BMM(int); -INSTANTIATE_NAIVE_BMM(long); -INSTANTIATE_NAIVE_BMM(char); -INSTANTIATE_NAIVE_BMM(uchar); +INSTANTIATE_MM_OPS(long); +INSTANTIATE_MM_OPS(int); +INSTANTIATE_MM_OPS(short); +INSTANTIATE_MM_OPS(char); +INSTANTIATE_MM_OPS(uchar); diff --git a/aten/src/ATen/native/mps/kernels/Pooling.metal b/aten/src/ATen/native/mps/kernels/Pooling.metal index 4eec3ed4d1b6e..3eee8bb079a7a 100644 --- a/aten/src/ATen/native/mps/kernels/Pooling.metal +++ b/aten/src/ATen/native/mps/kernels/Pooling.metal @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -88,6 +89,53 @@ void max_pool_3d_input_iter( } } +template +void max_pool_2d_input_iter( + constant T* input, + device T* output, + device int64_t* indices, + constant int32_t* input_sizes, + constant int32_t* input_strides, + thread int32_t (&pooling_dim_indices)[3], + constant int32_t* kernel_size, + constant int32_t* stride, + constant int32_t* padding, + constant int32_t* dilation) { + auto bounds0 = get_input_iter_bounds<0>( + input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation); + auto bounds1 = get_input_iter_bounds<1>( + input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation); + + auto d0 = dilation[0]; + auto d1 = dilation[1]; + + T max_value = input + [input_strides[0] * bounds0.start + input_strides[1] * bounds1.start]; + auto max_index = bounds0.start * input_sizes[1] + bounds1.start; + + for (auto i0 = bounds0.start; i0 < bounds0.end; i0 += d0) { + auto offset0 = input_strides[0] * i0; + + for (auto i1 = bounds1.start; i1 < bounds1.end; i1 += d1) { + auto offset1 = input_strides[1] * i1; + + auto input_value = input[offset0 + offset1]; + bool is_greater = input_value > max_value; + + max_value = is_greater ? input_value : max_value; + + if (return_indices) { + auto input_index = i0 * input_sizes[1] + i1; + max_index = is_greater ? 
input_index : max_index; + } + } + } + *output = max_value; + if (return_indices) { + *indices = max_index; + } +} + struct PoolOffsets { int32_t output; int32_t indices; @@ -212,7 +260,7 @@ kernel void max_pool( PoolOffsets offsets = find_pool_offsets( output_sizes, output_strides, - indices_strides, + return_indices ? indices_strides : nullptr, input_strides, pooling_dim_indices, dims, @@ -224,18 +272,47 @@ kernel void max_pool( indices += offsets.indices; input += offsets.input_leading; - max_pool_3d_input_iter( - input, - output, - indices, - input_sizes + leading_dims, - input_strides + leading_dims, - pooling_dim_indices, - kernel_size, - stride, - padding, - dilation, - return_indices); + switch (pooling_dims) { + case 2: + if (return_indices) { + return max_pool_2d_input_iter( + input, + output, + indices, + input_sizes + leading_dims, + input_strides + leading_dims, + pooling_dim_indices, + kernel_size, + stride, + padding, + dilation); + } else { + return max_pool_2d_input_iter( + input, + output, + indices, + input_sizes + leading_dims, + input_strides + leading_dims, + pooling_dim_indices, + kernel_size, + stride, + padding, + dilation); + } + case 3: + return max_pool_3d_input_iter( + input, + output, + indices, + input_sizes + leading_dims, + input_strides + leading_dims, + pooling_dim_indices, + kernel_size, + stride, + padding, + dilation, + return_indices); + } } // Finds the element in the grad input which corresponds to the index into the @@ -426,8 +503,8 @@ void avg_pool_3d_input_iter( padding, count_include_pad); - T value_sum = 0; - auto divisor = has_divisor_override + opmath_t value_sum = 0; + opmath_t divisor = has_divisor_override ? divisor_override : (bounds0.count) * (bounds1.count) * (bounds2.count); @@ -440,11 +517,58 @@ void avg_pool_3d_input_iter( for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) { auto offset2 = input_strides[2] * i2; auto input_value = input[offset0 + offset1 + offset2]; - value_sum += input_value; + value_sum += static_cast>(input_value); } } } - *output = value_sum / static_cast(divisor); + *output = static_cast(value_sum / divisor); +} + +// Iterates through all the input elements that this kernel needs to +// apply max to. Specialized for 2 pooling dimensions. +template +void avg_pool_2d_input_iter( + constant T* input, + device T* output, + constant int32_t* input_sizes, + constant int32_t* input_strides, + thread int32_t (&pooling_dim_indices)[3], + constant int32_t* kernel_size, + constant int32_t* stride, + constant int32_t* padding, + bool count_include_pad, + bool has_divisor_override, + int32_t divisor_override) { + auto bounds0 = get_avg_pool_input_iter_bounds<0>( + input_sizes, + pooling_dim_indices, + kernel_size, + stride, + padding, + count_include_pad); + auto bounds1 = get_avg_pool_input_iter_bounds<1>( + input_sizes, + pooling_dim_indices, + kernel_size, + stride, + padding, + count_include_pad); + + opmath_t value_sum = 0; + opmath_t divisor = has_divisor_override + ? 
divisor_override + : (bounds0.count) * (bounds1.count); + + for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) { + auto offset0 = input_strides[0] * i0; + + for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) { + auto offset1 = input_strides[1] * i1; + auto input_value = input[offset0 + offset1]; + value_sum += static_cast>(input_value); + } + } + *output = static_cast(value_sum / divisor); } template @@ -543,18 +667,33 @@ kernel void avg_pool( input_sizes += leading_dims; input_strides += leading_dims; - avg_pool_3d_input_iter( - input, - output, - input_sizes, - input_strides, - pooling_dim_indices, - kernel_size, - stride, - padding, - params.count_include_pad, - params.has_divisor_override, - params.divisor_override); + if (pooling_dims == 3) { + avg_pool_3d_input_iter( + input, + output, + input_sizes, + input_strides, + pooling_dim_indices, + kernel_size, + stride, + padding, + params.count_include_pad, + params.has_divisor_override, + params.divisor_override); + } else if (pooling_dims == 2) { + avg_pool_2d_input_iter( + input, + output, + input_sizes, + input_strides, + pooling_dim_indices, + kernel_size, + stride, + padding, + params.count_include_pad, + params.has_divisor_override, + params.divisor_override); + } } template diff --git a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal index 23c4810a24963..7db38da80532f 100644 --- a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal @@ -490,11 +490,6 @@ struct bitwise_not_functor { } }; -template -float erfc(T x) { - return 1.0 - erf(x); -} - struct round_decimals_functor { template inline T operator()(const T x, const long ndigits) { @@ -503,6 +498,17 @@ struct round_decimals_functor { } }; +struct round_functor { + template , bool> = true> + inline T operator()(const T x) { + return static_cast(rint(float(x))); + } + template , bool> = true> + inline T operator()(const T x) { + return x; + } +}; + DEFINE_UNARY_FLOATING_FUNCTOR(erf); DEFINE_UNARY_FLOATING_FUNCTOR(erfc); DEFINE_UNARY_FLOATING_FUNCTOR(erfinv); @@ -515,6 +521,13 @@ REGISTER_UNARY_OP(neg, char, char); REGISTER_UNARY_OP(neg, uchar, uchar); REGISTER_UNARY_OP(neg, float, float); REGISTER_UNARY_OP(neg, half, half); +REGISTER_UNARY_OP(round, int, int); +REGISTER_UNARY_OP(round, long, long); +REGISTER_UNARY_OP(round, short, short); +REGISTER_UNARY_OP(round, char, char); +REGISTER_UNARY_OP(round, uchar, uchar); +REGISTER_UNARY_OP(round, float, float); +REGISTER_UNARY_OP(round, half, half); REGISTER_UNARY_OP(bitwise_not, int, int); REGISTER_UNARY_OP(bitwise_not, long, long); @@ -558,6 +571,7 @@ REGISTER_UNARY_OP(abs, half, half); INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat); REGISTER_UNARY_OP(neg, bfloat, bfloat); +REGISTER_UNARY_OP(round, bfloat, bfloat); REGISTER_UNARY_OP(abs, bfloat, bfloat); INSTANTIATE_UNARY_KERNELS2(half, half); INSTANTIATE_UNARY_KERNELS2(float, float); diff --git a/aten/src/ATen/native/mps/operations/BinaryKernel.mm b/aten/src/ATen/native/mps/operations/BinaryKernel.mm index 806eeb82e1d17..0b303f48028f4 100644 --- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm @@ -53,6 +53,7 @@ void binary_op_kernel(const std::string func_name, .add_input(input) .add_input(other) .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(true) .build(); lib.exec_binary_kernel(iter, func_name, alpha); @@ -167,6 +168,10 @@ static void lerp_scalar_mps_kernel(at::TensorIteratorBase& iter, 
const Scalar& w lib.exec_binary_kernel(iter, "lerp_alpha", weight); } +static void native_dropout_mask_and_scale_mps_kernel(at::TensorIteratorBase& iter, const Scalar& scale) { + lib.exec_binary_kernel(iter, "native_dropout_mask_and_scale", scale); +} + static void mul_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "mul"); } @@ -191,6 +196,14 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "fmod"); } +static void igamma_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igamma"); +} + +static void igammac_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igammac"); +} + REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel) REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel) REGISTER_DISPATCH(copysign_stub, ©sign_mps_kernel) @@ -216,4 +229,6 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { REGISTER_DISPATCH(div_trunc_stub, &div_trunc_mps_kernel) REGISTER_DISPATCH(fmod_stub, &fmod_mps_kernel) REGISTER_DISPATCH(remainder_stub, &remainder_mps_kernel) +REGISTER_DISPATCH(igamma_stub, &igamma_mps_kernel) +REGISTER_DISPATCH(igammac_stub, &igammac_mps_kernel) } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm index a9589ecc490ee..06b6edcff9407 100644 --- a/aten/src/ATen/native/mps/operations/BinaryOps.mm +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -48,28 +48,11 @@ #define BinaryOpFn(graph, primary, secondary) \ MPSGraphTensor*(mps::BinaryOpCachedGraph * graph, MPSGraphTensor * primary, MPSGraphTensor * secondary) -static inline Tensor legacy_complex_as_view(const Tensor& t) { - // Convert non-complex types (and cdouble CPU scalars) to cfloat - if (!isComplexType(t.scalar_type()) || t.scalar_type() == kComplexDouble) { - return at::view_as_real(t.to(kMPS, kComplexFloat)); - } - return at::view_as_real(t.dim() != 0 ? 
t : t.to(kMPS)); -} - static void binaryOpTensor(const Tensor& self, const Tensor& other, const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock) { - TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) && - (self.scalar_type() == ScalarType::Long || - (other.scalar_type() == ScalarType::Long && - (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))), - "MPS: ", - op_name, - " op with int64 input is supported natively starting from macOS 13.2"); - TORCH_CHECK_TYPE(!isComplexType(self.scalar_type()) || mps::supportsComplex(), - "Complex types are supported starting from MacOS 14.0+"); MPSStream* mpsStream = getCurrentMPSStream(); const bool is_self_scalar = self.dim() == 0; diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm index f167067216d48..101ef5feb224e 100644 --- a/aten/src/ATen/native/mps/operations/Blas.mm +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -51,9 +51,6 @@ inline void dot_check(const Tensor& self, const Tensor& other) { } // namespace mps Tensor dot_mps(const Tensor& self, const Tensor& other) { - TORCH_CHECK(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) || self.scalar_type() != ScalarType::Long, - "MPS: dot op doesn't support int64 input on MacOS13") - using namespace mps; using CachedGraph = MPSBinaryCachedGraph; diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 97d562730dd8a..d572d52d103a1 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -124,7 +124,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, IntArrayRef dilation, int64_t groups, std::optional input_shape) { - const bool is_macOS_13_2_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS); const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); Tensor input_t = input_t_; bool is3DConv = input_t.dim() == 5; @@ -132,9 +131,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, input_t = input_t.contiguous(); } - TORCH_CHECK(((input_t.dim() < 5) || is_macOS_13_2_or_newer), - "Conv3D is only supported on MPS for MacOS_13_2 or newer"); - TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types"); using namespace at::native::mps; diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm index 4f879c3b63b02..0c121cee8fb62 100644 --- a/aten/src/ATen/native/mps/operations/Copy.mm +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -60,7 +60,6 @@ static void copy_cast_mps(at::Tensor& dst, outputTensor = [mpsGraph castTensor:outputTensor toType:dstDType name:@"cast"]; } if (needs_conj) { - TORCH_CHECK(supportsComplex(), "MPS complex tensors conjugation needs MacOS14+"); outputTensor = [mpsGraph conjugateWithTensor:outputTensor name:nil]; } @@ -275,24 +274,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { // for GPU to GPU copies we only encode to stream's command buffer (no flushing) stream->copy(sourceBuffer, destBuffer, src.nbytes(), src_byte_offset, dst_byte_offset, profile_id); } else { - // Simulate cast to Complex on older MacOS by initializing real and imag parts - if (dst_.is_complex() && !supportsComplex()) { - if (!src.is_complex()) { - at::real(dst_).copy_(src); - at::imag(dst_).fill_(0); - } else if (src.is_conj() || dst_.is_conj()) { - 
// One cannot take view of conjugated tensor, but for some reason real and imag views are fine - // Use this to implement a conjugation - at::real(dst_).copy_(at::real(src)); - if (src.is_conj() != dst_.is_conj()) { - at::imag(dst_).copy_(at::neg(at::imag(src))); - } else { - at::imag(dst_).copy_(at::imag(src)); - } - } else { - at::view_as_real(dst_).copy_(at::view_as_real(src)); - } - } else if (dst_byte_offset) { + if (dst_byte_offset) { auto maybeCastedSource = at::empty(dst_.sizes(), dst_.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt); auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource); diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm index d072e5a40ac96..4d3f99ea9e02d 100644 --- a/aten/src/ATen/native/mps/operations/Distributions.mm +++ b/aten/src/ATen/native/mps/operations/Distributions.mm @@ -87,7 +87,6 @@ case kFloat: return MPSDataTypeFloat32; case kBFloat16: { - checkSupportsBFloat16(); return MPSDataTypeBFloat16; } default: diff --git a/aten/src/ATen/native/mps/operations/Dropout.mm b/aten/src/ATen/native/mps/operations/Dropout.mm new file mode 100644 index 0000000000000..116367d809eb5 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Dropout.mm @@ -0,0 +1,45 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +static Tensor native_dropout_mask_and_scale(const Tensor& input, const Tensor& mask, float scale) { + auto output = at::empty_like(input); + mps::binary_op_kernel("native_dropout_mask_and_scale", input, mask, output, scale); + return output; +} + +std::tuple native_dropout_mps(const Tensor& input, double p, std::optional train) { + if (input.numel() == 0 || !train.value_or(false) || p == 0) { + return {input.clone(), at::ones_like(input, input.options().dtype(c10::kBool))}; + } + + float p_comp = 1.0f - p; + Tensor mask = at::empty_like(input, input.options().dtype(c10::kBool)); + mask.bernoulli_(p_comp); + auto scale = p_comp == 0 ? 0.0f : 1.0f / p_comp; + Tensor output = native_dropout_mask_and_scale(input, mask, scale); + return {std::move(output), std::move(mask)}; +} + +Tensor native_dropout_backward_mps(const Tensor& grad, const Tensor& mask, double scale) { + auto grad_float = isFloatingType(grad.scalar_type()) ? 
grad : grad.to(c10::kFloat); + return native_dropout_mask_and_scale(grad_float, mask, scale); +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/mps/operations/FastFourierTransform.mm b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm index a9ac701106170..7e9867c9b948d 100644 --- a/aten/src/ATen/native/mps/operations/FastFourierTransform.mm +++ b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm @@ -88,7 +88,6 @@ Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, // TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237 Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) { - TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" + std::to_string(normalization) + ":" + std::to_string(onesided); @autoreleasepool { @@ -129,7 +128,6 @@ Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t normalization, int64_t last_dim_size, Tensor& out) { - TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + std::to_string(normalization) + ":" + std::to_string(last_dim_size); @autoreleasepool { @@ -155,7 +153,6 @@ Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, } Tensor& _fft_c2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward, Tensor& out) { - TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + std::to_string(normalization) + ":" + std::to_string(forward); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm index 1e701d314354d..ef85633889487 100644 --- a/aten/src/ATen/native/mps/operations/GridSampler.mm +++ b/aten/src/ATen/native/mps/operations/GridSampler.mm @@ -1,7 +1,10 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -9,9 +12,17 @@ #else #include #include +#include #endif namespace at::native { + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + namespace mps { static void grid_sampler_2d_mps_impl(Tensor& output, const Tensor& input, @@ -120,6 +131,96 @@ static void grid_sampler_2d_mps_impl(Tensor& output, runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } + +static void grid_sampler_template(Tensor& output, + const Tensor& input, + const Tensor& grid, + int64_t _interpolation_mode, + int64_t _padding_mode, + bool align_corners, + int32_t sampler_dims, + const std::string& op_name) { + check_grid_sampler_common(input, grid); + switch (sampler_dims) { + case 2: + check_grid_sampler_2d(input, grid); + break; + case 3: + check_grid_sampler_3d(input, grid, _interpolation_mode); + break; + default: + TORCH_INTERNAL_ASSERT(false, "Only 2D and 3D sampling are supported, but got: ", sampler_dims); + } + TORCH_CHECK(input.scalar_type() == grid.scalar_type(), + "expected input and grid to have the same type, but got ", + input.scalar_type(), + " and ", + grid.scalar_type()); + + auto 
interpolation_mode = static_cast<GridSamplerInterpolation>(_interpolation_mode); + auto padding_mode = static_cast<GridSamplerPadding>(_padding_mode); + + switch (interpolation_mode) { + case GridSamplerInterpolation::Bilinear: + break; + case GridSamplerInterpolation::Nearest: + TORCH_CHECK(false, op_name, ": Unsupported Nearest interpolation"); + break; + case GridSamplerInterpolation::Bicubic: + TORCH_CHECK(false, op_name, ": Unsupported Bicubic interpolation"); + break; + default: + TORCH_CHECK(false, op_name, ": Unrecognised interpolation mode: ", _interpolation_mode); + } + + switch (padding_mode) { + case GridSamplerPadding::Zeros: + case GridSamplerPadding::Border: + case GridSamplerPadding::Reflection: + break; + default: + TORCH_CHECK(false, op_name, ": Unrecognised Padding Mode: ", _padding_mode); + } + + auto input_size = input.sizes(); + auto grid_size = grid.sizes(); + output.resize_({input_size[0], input_size[1], grid_size[1], grid_size[2], grid_size[3]}, MemoryFormat::Contiguous); + + auto dims = input.dim(); + + GridSamplerParams<5> params; + params.sampler_dims = sampler_dims; + params.padding_mode = padding_mode; + params.interpolation_mode = interpolation_mode; + params.align_corners = align_corners; + + for (const auto dim : c10::irange(dims)) { + params.output_sizes[dim] = safe_downcast<int32_t, int64_t>(output.size(dim)); + params.output_strides[dim] = safe_downcast<int32_t, int64_t>(output.stride(dim)); + params.input_sizes[dim] = safe_downcast<int32_t, int64_t>(input.size(dim)); + params.input_strides[dim] = safe_downcast<int32_t, int64_t>(input.stride(dim)); + params.grid_sizes[dim] = safe_downcast<int32_t, int64_t>(grid.size(dim)); + params.grid_strides[dim] = safe_downcast<int32_t, int64_t>(grid.stride(dim)); + } + + auto num_threads = output.numel(); + MPSStream* mpsStream = getCurrentMPSStream(); + + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { + id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder(); + auto pso = lib.getPipelineStateForFunc("grid_sampler_" + scalarToMetalTypeString(input)); + + getMPSProfiler().beginProfileKernel(pso, op_name, {input, grid}); + [computeEncoder setComputePipelineState:pso]; + mtl_setArgs(computeEncoder, output, input, grid, params); + + mtl_dispatch1DJob(computeEncoder, pso, num_threads); + getMPSProfiler().endProfileKernel(pso); + } + }); +} + } // namespace mps Tensor grid_sampler_2d_mps(const Tensor& input, @@ -127,15 +228,6 @@ Tensor grid_sampler_2d_mps(const Tensor& input,
This may have performance implications."); - - return at::grid_sampler_2d(input.to("cpu"), grid.to("cpu"), interpolation_mode, padding_mode, align_corners) - .clone() - .to("mps"); - } - auto in_size = input.sizes(); auto grid_size = grid.sizes(); auto output = at::empty({in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); @@ -144,4 +236,21 @@ Tensor grid_sampler_2d_mps(const Tensor& input, return output; } +Tensor grid_sampler_3d_mps(const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners) { + auto output = at::empty({0}, input.options(), MemoryFormat::Contiguous); + mps::grid_sampler_template(output, + input, + grid, + interpolation_mode, + padding_mode, + align_corners, + /*sampler_dims=*/3, + /*op_name=*/"grid_sampler_3d"); + return output; +} + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index f00d155559da0..fa19d2f4d127f 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -108,26 +108,12 @@ static void validateInputData(const TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride, - const std::string& op, - bool accumulate) { - using namespace mps; - + const std::string& op) { const auto num_indices = index_size.size(); TORCH_CHECK(num_indices <= 16, "Current limit allows up to 16 indices to be used in MPS indexing kernels"); AT_ASSERT(num_indices == index_stride.size()); AT_ASSERT(static_cast(num_indices) == iter.ntensors() - 2); - const Tensor& inputTensor = iter.tensor(1); - const auto scalar_type = inputTensor.scalar_type(); - - if (accumulate) { - // No atomic support for the complex dtypes - TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || supportedFloatingType(scalar_type)); - } else { - TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || supportedFloatingType(scalar_type) || - scalar_type == ScalarType::ComplexFloat || scalar_type == ScalarType::ComplexHalf, - getMPSTypeString(inputTensor) + std::string(" not supported for index.Tensor_out")); - } } static Tensor& masked_select_out_mps_impl(Tensor& result, const Tensor& self, const Tensor& mask) { @@ -158,7 +144,7 @@ static void dispatch_index_kernel(TensorIteratorBase& iter, IntArrayRef index_stride, const std::string& kernel_name, const bool serial = false) { - validateInputData(iter, index_size, index_stride, "index.Tensor_out", /*accumulate=*/false); + validateInputData(iter, index_size, index_stride, "index.Tensor_out"); if (iter.numel() == 0) return; if (!iter.can_use_32bit_indexing()) { @@ -200,7 +186,7 @@ static void dispatch_index_kernel(TensorIteratorBase& iter, } static void index_kernel_mps(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride) { - validateInputData(iter, index_size, index_stride, "index.Tensor_out", /*accumulate=*/false); + validateInputData(iter, index_size, index_stride, "index.Tensor_out"); dispatch_index_kernel( iter, index_size, index_stride, fmt::format("index_select_{}", getBitSizeString(iter.tensor_base(0)))); } @@ -210,7 +196,7 @@ static void index_put_kernel_mps(TensorIterator& iter, IntArrayRef index_stride, bool accumulate) { @autoreleasepool { - validateInputData(iter, index_size, index_stride, "index_put_impl", accumulate); + validateInputData(iter, index_size, index_stride, "index_put_impl"); if (accumulate) { dispatch_index_kernel(iter, index_size, @@ -244,7 +230,7 
@@ static void index_put_kernel_mps(TensorIterator& iter, index.numel()); int64_t idx = index.item(); TORCH_CHECK(idx == 0, "index_copy_(): the only valid index for a 0-dim tensor is 0, but got ", idx); - result.copy_(source); + result.copy_(source.squeeze()); return; } @@ -268,11 +254,12 @@ static void index_put_kernel_mps(TensorIterator& iter, } } - TORCH_CHECK(source.size(dim) == index.numel(), + const auto source_size_dim = source.dim() > 0 ? source.size(dim) : 1; + TORCH_CHECK(index.numel() == source_size_dim, "index_copy_(): Number of indices (", index.numel(), ") should be equal to source.size(dim) (", - source.size(dim), + source_size_dim, ")"); auto stream = getCurrentMPSStream(); @@ -295,7 +282,7 @@ static void index_put_kernel_mps(TensorIterator& iter, [computeEncoder setComputePipelineState:indexCopyPSO]; mtl_setArgs(computeEncoder, result, self, source, index, dim_arg, self.sizes(), ndim, indices_numel); if (!is_dense) { - mtl_setArgs<8>(computeEncoder, self.strides(), result.strides(), source.strides()); + mtl_setArgs<8>(computeEncoder, self.strides(), result.strides(), source.strides(), index.strides()); } mtl_dispatch1DJob(computeEncoder, indexCopyPSO, result.numel()); } @@ -353,14 +340,7 @@ static Tensor nonzero_fallback(const Tensor& self) { } Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) { - if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 14.0. ", - "Falling back on CPU. This may have performance implications."); - Tensor out_fallback = nonzero_fallback(self); - at::native::resize_output(out_, out_fallback.sizes()); - out_.copy_(out_fallback); - return out_; - } else if (self.is_complex()) { + if (self.is_complex()) { TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes. ", "Falling back on CPU. This may have performance implications."); Tensor out_fallback = nonzero_fallback(self); @@ -445,11 +425,7 @@ static Tensor nonzero_fallback(const Tensor& self) { } Tensor nonzero_mps(const Tensor& self) { - if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 14.0. ", - "Falling back on CPU. This may have performance implications."); - return nonzero_fallback(self); - } else if (self.is_complex()) { + if (self.is_complex()) { TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes ", "Falling back on CPU. This may have performance implications."); return nonzero_fallback(self); @@ -537,7 +513,28 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { return; } - TORCH_CHECK(source.scalar_type() != ScalarType::Long, "index_add(): Expected non int64 dtype for source."); + bool use_deterministic_algorithm = globalContext().deterministicAlgorithms(); + + // TODO: Do not use deterministic algorithm for long/complex but rather implement it as Metal shader + use_deterministic_algorithm |= source.scalar_type() == ScalarType::Long; + use_deterministic_algorithm |= c10::isComplexType(source.scalar_type()); + + if (use_deterministic_algorithm) { + if (!result.is_same(self)) { + result.copy_(self); + } + torch::List> indices; + indices.reserve(dim + 1); + for (const auto i : c10::irange(dim)) { + indices.emplace_back(); + } + indices.emplace_back(index.to(at::kLong)); + const Tensor result_ = (result.dim() == 0) ? result.view(1) : result; + const Tensor source_ = (source.dim() == 0) ? 
source.view(1) : source; + result_.index_put_(indices, source_.mul(alpha), true); + return; + } + auto casted_type = isFloatingType(source.scalar_type()) ? ScalarType::Float : ScalarType::Int; struct CachedGraph : public MPSCachedGraph { @@ -599,28 +596,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { } Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { - IntArrayRef input_shape = self.sizes(); - auto num_input_dims = input_shape.size(); - - auto num_indices = index.numel(); - TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector"); - - dim = maybe_wrap_dim(dim, self.dim()); - std::vector shape_data(num_input_dims); - - // Calculate new shape - for (const auto i : c10::irange(num_input_dims)) { - if (i == static_cast(dim)) { - shape_data[i] = num_indices; - } else { - shape_data[i] = input_shape[i]; - } - } - - IntArrayRef output_shape = IntArrayRef(shape_data.data(), num_input_dims); - - Tensor result = at::empty(output_shape, self.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt); - + Tensor result = at::empty({0}, self.options()); index_select_out_mps(self, dim, index, result); return result; } @@ -642,25 +618,11 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { TORCH_CHECK(self.scalar_type() == output.scalar_type(), "index_select(): self and output must have the same scalar type"); TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor"); - TORCH_CHECK(output.dim() == 0 || index.size(-1) == output.size(dim), - "index_select(): index and output must have the same size at `dim`th dimension, but got ", - index.size(-1), - " and ", - output.size(dim), - "."); - - for (const auto i : irange(self.dim())) { - if (i == dim) - continue; - TORCH_CHECK(self.size(i) == output.size(i), - "index_select(): self and output must have the same dimensions except for `dim`th dimension, but got ", - self.size(i), - " and ", - output.size(i), - " at dimension ", - i, - "."); + auto output_size = self.sizes().vec(); + if (self.dim() > 0) { + output_size[dim] = num_indices; } + at::native::resize_output(output, output_size); // Empty index if (num_indices == 0 || self.numel() == 0) { @@ -946,6 +908,8 @@ Tensor embedding_dense_backward_mps(const Tensor& grad_, TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_fill_(): Expected dtype int32 or int64 for index"); TORCH_CHECK(dim == 0 || dim < self.dim(), "index_fill_(): Indexing dim ", dim, " is out of bounds of tensor"); + // MPS.scatter crashes if used with complex dtypes + TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "index_fill_(): Complex types are yet not supported"); // Empty index if (num_indices == 0) { diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm index 42769c13f1e1b..219086edd8e37 100644 --- a/aten/src/ATen/native/mps/operations/Linear.mm +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -115,7 +115,10 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt return output; } - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) { + // No-graph execution causes nonsense if these are non-contiguous. 
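For context on the deterministic index_add_ path added in Indexing.mm above: routing long/complex (or deterministic-mode) inputs through an accumulating index_put_ preserves index_add_ semantics, including accumulation over duplicate indices. A minimal Python sketch of that equivalence (illustrative only; CPU tensors for clarity, the snippet is not part of the patch):

import torch

# Accumulating index_put_ reproduces index_add_ along dim 0; accumulate=True
# is what makes duplicate indices sum instead of overwrite.
x = torch.zeros(4, 3)
idx = torch.tensor([1, 1, 3])        # note the duplicate index
src = torch.ones(3, 3)

via_index_add = x.clone().index_add_(0, idx, src, alpha=2.0)
via_index_put = x.clone().index_put_((idx,), src * 2.0, accumulate=True)
assert torch.equal(via_index_add, via_index_put)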
+ const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous(); + + if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) { _mps_linear_nograph(input, weight, bias, output); // Squeeze last dim of 1D linear return weight_arg.dim() != 1 ? output : output.squeeze(-1); diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index 3cdf0021e987f..7a3dde679c05f 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -112,6 +112,61 @@ return output; } +Tensor& do_metal_addmm(const Tensor& self, + const Tensor& other, + Tensor& output, + const Scalar& alpha, + const Scalar& beta, + const Tensor& bias) { + if (beta.toDouble() == 0 && alpha.toDouble() == 1) { + return do_metal_mm(self, other, output); + } + auto stream = getCurrentMPSStream(); + auto device = MPSDevice::getInstance()->device(); + auto matmulPSO = lib.getPipelineStateForFunc("addmm_" + mps::scalarToMetalTypeString(output)); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + getMPSProfiler().beginProfileKernel(matmulPSO, "addmm", {self, other}); + auto computeEncoder = stream->commandEncoder(); + [computeEncoder setComputePipelineState:matmulPSO]; + std::array sizes = {static_cast(self.size(0)), + static_cast(self.size(1)), + static_cast(output.size(1))}; + std::array strides = {self.stride(0), + self.stride(1), + other.stride(0), + other.stride(1), + output.stride(0), + output.stride(1), + bias.stride(0), + bias.stride(1)}; + union { + std::array i64; + std::array i32; + std::array f32; + } alpha_beta; + if (output.scalar_type() == kLong) { + alpha_beta.i64 = {alpha.toLong(), beta.toLong()}; + } else if (c10::isIntegralType(output.scalar_type(), true)) { + alpha_beta.i32 = {alpha.toInt(), beta.toInt()}; + } else { + TORCH_INTERNAL_ASSERT(c10::isFloatingType(output.scalar_type())); + alpha_beta.f32 = {alpha.toFloat(), beta.toFloat()}; + } + constexpr uint32_t TILE_DIM = 16; // fastest performance from tests on multiple macs + uint32_t gridSizeX = (output.size(1) + TILE_DIM - 1) / TILE_DIM; + uint32_t gridSizeY = (self.size(0) + TILE_DIM - 1) / TILE_DIM; + + MTLSize threadsPerThreadgroup = MTLSizeMake(TILE_DIM, TILE_DIM, 1); + MTLSize threadgroupsPerGrid = MTLSizeMake(gridSizeX, gridSizeY, 1); + mtl_setArgs(computeEncoder, self, other, output, bias, alpha_beta.i64, strides, sizes); + [computeEncoder dispatchThreadgroups:threadgroupsPerGrid threadsPerThreadgroup:threadsPerThreadgroup]; + getMPSProfiler().endProfileKernel(matmulPSO); + } + }); + return output; +} + std::tuple do_mm(MPSGraph* graph, const Tensor& self, const Tensor& other) { @@ -644,7 +699,6 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const TORCH_CHECK(output.is_mps()); TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK(supportedFloatingOrComplexType(self), "MPS device does not support addmm for non-float input"); TensorArg args[]{{output, "out", 0}, {bias, "self", 1}, {self, "mat1", 2}, {other, "mat2", 3}}; checkAllSameGPU(__func__, args); @@ -671,6 +725,10 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const return output; } + if (use_metal_mm(self, other, output)) { + return do_metal_addmm(self, other, output, alpha, beta, *bias_); + } + bool is_beta_non_zero = beta.toDouble() != 0.0; struct CachedGraph : public mps::MPSCachedGraph { diff --git 
a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm index 0c2c25946bb4b..2945ebf715f27 100644 --- a/aten/src/ATen/native/mps/operations/Pad.mm +++ b/aten/src/ATen/native/mps/operations/Pad.mm @@ -460,6 +460,9 @@ Tensor replication_pad3d_backward_mps(const Tensor& grad_output, const Tensor& i // backward pass is explicitly handled in autograd by negating the "pad" argument Tensor constant_pad_nd_mps(const Tensor& self, IntArrayRef pad, const Scalar& value) { + if (pad.empty()) { + return self.clone(); + } if (pad.size() > 6) { TORCH_WARN_ONCE("MPS: The constant padding of more than 3 dimensions is not currently supported natively. ", "It uses View Ops default implementation to run. This may have performance implications."); diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm index b2bc870844a88..d916320b2e238 100644 --- a/aten/src/ATen/native/mps/operations/Pooling.mm +++ b/aten/src/ATen/native/mps/operations/Pooling.mm @@ -297,13 +297,13 @@ static PoolSizes process_pool_sizes(const Tensor& input, pooling_dims, " ints"); - TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3, + TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == pooling_dims, op_name, ": stride must either be omitted, a single int, or a tuple of ", pooling_dims, " ints"); - TORCH_CHECK(padding.size() == 1 || padding.size() == 3, + TORCH_CHECK(padding.size() == 1 || padding.size() == pooling_dims, op_name, ": padding must either be a single int, or a tuple of ", pooling_dims, @@ -333,6 +333,22 @@ static PoolSizes process_pool_sizes(const Tensor& input, ": pad should be at most half of effective kernel size"); } + if (pooling_dims == 2) { + const auto memory_format = input.suggest_memory_format(); + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + if (memory_format == at::MemoryFormat::ChannelsLast) { + // Expect tensor in NHWC format and allow 0-dim only for N. + TORCH_CHECK((dims == 4 && valid_dims && input.size(3) != 0), + "Expected 4D (batch mode) tensor expected for input with channels_last layout" + " with optional 0 dim batch size for input, but got: ", + input.sizes()); + } else { + TORCH_CHECK((dims == 3 && input.size(0) != 0 && valid_dims) || (dims == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:", + input.sizes()); + } + } + for (const auto dim : c10::irange(static_cast(leading_dims == 2), dims)) { TORCH_CHECK(input.size(dim) > 0, op_name, ": Expected input's non-batch dimensions to have positive length"); } @@ -786,6 +802,16 @@ static void avg_pool_backward_out_mps_template(const Tensor& grad_input, } // namespace mps +// TODO: The MPS graph impl can sometimes give significantly better performance +// than the Metal impl for cases where the stride is 1 in all dimensions. There +// may be a code path in the graph kernel that specifically optimizes for that +// case. We should look into implementing a specialized case in Metal so we can +// avoid using the graph impl. +static bool use_graph_for_max_pool2d(IntArrayRef kernel_size, IntArrayRef stride_) { + IntArrayRef stride = stride_.empty() ? 
kernel_size : stride_; + return (stride[0] == 1) && (stride.size() == 1 || stride[1] == 1); +} + Tensor mps_max_pool2d(const Tensor& input, IntArrayRef kernel_size, IntArrayRef stride, @@ -793,24 +819,37 @@ Tensor mps_max_pool2d(const Tensor& input, IntArrayRef dilation, bool ceil_mode) { Tensor output = at::empty({0}, input.options(), MemoryFormat::Contiguous); - mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { - MPSGraph* mpsGraph = cachedGraph.graph(); - return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil]; - }; - mps::pool2d_template(input, - output, - std::nullopt, - std::nullopt, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - false, - std::nullopt, - pooling_op_block, - "max_pool2d"); - + bool use_graph = use_graph_for_max_pool2d(kernel_size, stride); + if (use_graph) { + mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { + MPSGraph* mpsGraph = cachedGraph.graph(); + return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil]; + }; + mps::pool2d_template(input, + output, + std::nullopt, + std::nullopt, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + false, + std::nullopt, + pooling_op_block, + "max_pool2d"); + } else { + mps::max_pool_with_indices_out_mps_template(output, + std::nullopt, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + /*pooling_dims=*/2, + "max_pool2d"); + } return output; } @@ -855,32 +894,45 @@ Tensor mps_max_pool2d_backward(const Tensor& grad_output, bool ceil_mode, const Tensor& output, const Tensor& indices) { - auto indices_memory_format = indices.suggest_memory_format(); - - mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { - MPSGraph* mpsGraph = cachedGraph.graph(); - NSArray* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor - descriptor:desc - name:nil]; - cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long); - return poolOutputs[0]; - }; - mps::pool2d_template(input, - output, - indices, - std::nullopt, - kernel_size, - stride, - padding, - dilation, - ceil_mode, - false, - std::nullopt, - pooling_op_block, - "max_pool2d_indices"); + bool use_graph = use_graph_for_max_pool2d(kernel_size, stride); + if (use_graph) { + auto indices_memory_format = indices.suggest_memory_format(); + + mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) { + MPSGraph* mpsGraph = cachedGraph.graph(); + NSArray* poolOutputs = + [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil]; + cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long); + return poolOutputs[0]; + }; + mps::pool2d_template(input, + output, + indices, + std::nullopt, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + false, + std::nullopt, + pooling_op_block, + "max_pool2d_indices"); + if (indices_memory_format == MemoryFormat::ChannelsLast) { + const_cast(indices) = indices.to(MemoryFormat::ChannelsLast); + } - if (indices_memory_format == MemoryFormat::ChannelsLast) { - const_cast(indices) = indices.to(MemoryFormat::ChannelsLast); + } else { + mps::max_pool_with_indices_out_mps_template(output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + /*pooling_dims=*/2, + "max_pool2d"); } } @@ -1085,17 +1137,30 @@ Tensor max_unpooling3d_forward_mps(const Tensor& self, bool count_include_pad, 
std::optional divisor_override, const Tensor& output) { - mps::avg_pool2d_template(input, - output, - std::nullopt, - {kH, kW}, - {dH, dW}, - {padH, padW}, - {1, 1}, - ceil_mode, - count_include_pad, - divisor_override, - "avg_pool2d"); + if (ceil_mode) { + mps::avg_pool_out_mps_template(output, + input, + {kH, kW}, + {dH, dW}, + {padH, padW}, + ceil_mode, + count_include_pad, + divisor_override, + /*pooling_dims=*/2, + "avg_pool3d"); + } else { + mps::avg_pool2d_template(input, + output, + std::nullopt, + {kH, kW}, + {dH, dW}, + {padH, padW}, + {1, 1}, + ceil_mode, + count_include_pad, + divisor_override, + "avg_pool2d"); + } } TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps) diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm index 21020bad467d0..ae13504d9003e 100644 --- a/aten/src/ATen/native/mps/operations/ReduceOps.mm +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -152,8 +152,6 @@ static void reduction_out_mps(const Tensor& input_t, const Tensor& output_t, MPSReductionType reduction_type, const std::string& func_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, func_name); // NS: TODO: get rid of all those shenanigans and just call reduction_op with view tensor bool canSqueezeLastDim = true; IntArrayRef input_shape = input_t.sizes(); @@ -236,12 +234,10 @@ static void reduction_out_mps(const Tensor& input_t, MPSGraphTensor* castInputTensor = inputTensor; MPSDataType inputCastType = MPSDataTypeInvalid; if (dtype.has_value() && - (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt || - (dtype.value() == kLong && macOS13_3_plus))) { + (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt || dtype.value() == kLong)) { inputCastType = getMPSDataType(dtype.value()); } else if (inputScalarType != kInt && inputScalarType != kHalf && inputScalarType != kFloat && - inputScalarType != kComplexFloat && inputScalarType != kComplexHalf && - (inputScalarType != kLong || !macOS13_3_plus)) { + inputScalarType != kComplexFloat && inputScalarType != kComplexHalf && inputScalarType != kLong) { inputCastType = getMPSDataType(kFloat); } @@ -460,7 +456,7 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t, errMessage += ": reduction dim must be in the range of input shape"; for (const auto dim : dim_value) { auto wrap_dim = maybe_wrap_dim(dim, num_input_dims); - TORCH_CHECK(wrap_dim < static_cast(input_shape.size()), errMessage.c_str()) + TORCH_CHECK(wrap_dim < (num_input_dims ? num_input_dims : 1), errMessage.c_str()) } } @@ -615,9 +611,6 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t, } static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, nanmedian ? 
"nanmedian" : "median"); - IntArrayRef input_shape = input_t.sizes(); int64_t num_in_elements = c10::multiply_integers(input_shape); @@ -634,8 +627,7 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) { auto medianCachedGraph = LookUpOrCreateCachedGraph(medianKey, [&](auto mpsGraph, auto newCachedGraph) { MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); MPSGraphTensor* reshapedTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil]; @@ -693,9 +685,6 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) { } static Tensor min_max_mps_impl(const Tensor& input_t, MPSReductionType reduction_type, const std::string& func_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "min_max"); - using CachedGraph = MPSUnaryCachedGraph; IntArrayRef input_shape = input_t.sizes(); @@ -713,8 +702,7 @@ static Tensor min_max_mps_impl(const Tensor& input_t, MPSReductionType reduction MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); MPSGraphTensor* castOutputTensor = nil; - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); NSArray* axes = getTensorAxes(input_t); if (reduction_type == MPSReductionType::MAX) { @@ -749,9 +737,6 @@ static void min_max_out_mps(const Tensor& input_t, const Tensor& indices_t, MPSReductionType reduction_type, const std::string& func_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "min_max_out"); - if (output_t.numel() == 0) { return; } @@ -789,8 +774,7 @@ static void min_max_out_mps(const Tensor& input_t, auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); MPSGraphTensor* outputTensor = nil; - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); if (reduction_type == MPSReductionType::MAX) { outputTensor = [mpsGraph reductionMaximumPropagateNaNWithTensor:castInputTensor axis:(NSInteger)dim_ name:nil]; @@ -896,9 +880,6 @@ static void argmax_argmin_out_mps(const Tensor& input_t, const std::string& func_name) { using CachedGraph = MPSUnaryCachedGraph; - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "argmax_argmin_out"); - int64_t dim_ = -1; if (dim.has_value()) { @@ -953,7 +934,7 @@ static void argmax_argmin_out_mps(const Tensor& input_t, MPSGraphTensor* castInputTensor = inputTensor; if (inputScalarType != kInt && inputScalarType != kHalf && inputScalarType != kFloat && - (inputScalarType != kLong || !macOS13_3_plus)) { + inputScalarType != kLong) { castInputTensor = castMPSTensor(mpsGraph, inputTensor, kFloat); } if (reduction_type == MPSReductionType::MAX) { @@ -1282,9 +1263,6 @@ static void all_any_common_impl_mps(const Tensor& input_t, return; } - bool macOS13_3_plus = 
is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, op_name); - int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); native::zero_numel_check_dims(input_t, dim_, op_name.c_str()); @@ -1303,7 +1281,7 @@ static void all_any_common_impl_mps(const Tensor& input_t, auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); // reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4 // See https://github.com/pytorch/pytorch/issues/95538 MPSGraphTensor* outputTensor = nil; @@ -1369,14 +1347,11 @@ static void all_any_common_impl_mps(const Tensor& input_t, return; } - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "any_all_out"); - @autoreleasepool { std::string key = std::string("any_all_out_mps:") + getTensorsStringKey(input_t); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); // reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4 // See https://github.com/pytorch/pytorch/issues/95538 if (input_t.dim() > 4) { @@ -1420,14 +1395,11 @@ static void all_any_common_impl_mps(const Tensor& input_t, return; } - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "all_all_out"); - @autoreleasepool { std::string key = std::string("all_all_out_mps:") + getTensorsStringKey(input_t); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); // reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4 // See https://github.com/pytorch/pytorch/issues/95538 if (input_t.ndimension() > 4) { @@ -1512,9 +1484,6 @@ static void median_out_mps_common(const Tensor& input_t, Tensor& indices, const std::string& func_name, bool nanmedian) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "median_out"); - int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); native::zero_numel_check_dims(input_t, dim_, "max()"); @@ -1585,8 +1554,7 @@ static void median_out_mps_common(const Tensor& input_t, getTensorsStringKey(indices); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t); - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t); MPSGraphTensor* effectiveLengthTensor = nil; if (nanmedian) { diff --git 
a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index 10668309a8c23..40afa15b4f700 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -129,16 +129,8 @@ void computeRepeatIndices(const index_t* repeat_ptr, }); } -Tensor repeat_interleave_mps(const Tensor& repeat_, std::optional output_size) { +Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { Tensor output; - Tensor repeat = repeat_; - if (repeat.scalar_type() == kLong && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS)) { - // #103810551: `repeat_interleave_common` uses cumsum to calculate the final shape of output, - // which currently doesn't support int64_t as input. Casting internally the indices to int32_t. - TORCH_WARN_ONCE( - "MPS: no support for int64 repeats mask, casting it to int32. Support has been added in macOS 13.3"); - repeat = repeat.to(kInt); - } AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { output = repeat_interleave_common>(repeat, output_size); }); diff --git a/aten/src/ATen/native/mps/operations/ScanKernel.mm b/aten/src/ATen/native/mps/operations/ScanKernel.mm index 9e3269d970143..80495ba9d501d 100644 --- a/aten/src/ATen/native/mps/operations/ScanKernel.mm +++ b/aten/src/ATen/native/mps/operations/ScanKernel.mm @@ -23,125 +23,6 @@ #include #endif -// Generic scan implementation that handles both simple scans and scans with indices -static void scan_mps_impl(const Tensor& self, - const std::vector& outputs, - int64_t dim, - const std::string& op_name) { - if (outputs[0].numel() == 0) { - return; - } - - const int64_t ndim = self.dim(); - const int64_t wrapped_dim = maybe_wrap_dim(dim, ndim); - - // Calculate dimensions for scan operation - int64_t row_size = self.size(wrapped_dim); - auto sizes = self.sizes(); - - bool is_innermost = (wrapped_dim == ndim - 1); - - // Check if all tensors are contiguous - bool is_contiguous = self.is_contiguous(); - for (const auto& output : outputs) { - is_contiguous = is_contiguous && output.is_contiguous(); - } - - uint32_t num_rows, num_orows, num_irows, num_threads; - - if (is_innermost) { - // Treat all outer dimensions as a single dimension - num_rows = self.numel() / row_size; - num_threads = num_rows; - } else { - // Treat all outer dimensions (i.e. dim_ < dim) as one - num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + wrapped_dim); - // Treat all inner dimensions (i.e. dim > dimension) as one - num_irows = c10::multiply_integers(sizes.begin() + wrapped_dim + 1, sizes.end()); - num_threads = num_orows * num_irows; - } - - MPSStream* mpsStream = getCurrentMPSStream(); - dispatch_sync_with_rethrow(mpsStream->queue(), ^() { - @autoreleasepool { - id computeEncoder = mpsStream->commandEncoder(); - - // Choose kernel based on contiguity and dimension - std::string kernel_name; - if (is_contiguous) { - kernel_name = - op_name + "_contiguous_" + (is_innermost ? 
"innermost_" : "outer_") + scalarToMetalTypeString(self); - } else { - kernel_name = op_name + "_strided_" + scalarToMetalTypeString(self); - } - - id scanPSO = lib.getPipelineStateForFunc(kernel_name); - - // this function call is a no-op if MPS Profiler is not enabled - getMPSProfiler().beginProfileKernel(scanPSO, op_name, [&]() { - std::vector all_tensors = {self}; - all_tensors.insert(all_tensors.end(), outputs.begin(), outputs.end()); - return all_tensors; - }()); - - [computeEncoder setComputePipelineState:scanPSO]; - - // Set input tensor - mtl_setBuffer(computeEncoder, self, 0); - - // Set output tensors - for (size_t i = 0; i < outputs.size(); ++i) { - mtl_setBuffer(computeEncoder, outputs[i], i + 1); - } - - if (is_contiguous) { - // Contiguous kernels - if (is_innermost) { - if (outputs.size() == 1) { - // Simple scan - mtl_setArgs<2>(computeEncoder, num_rows, static_cast(row_size)); - } else { - // Scan with indices - mtl_setArgs<3>(computeEncoder, num_rows, static_cast(row_size)); - } - } else { - if (outputs.size() == 1) { - // Simple scan - mtl_setArgs<2>(computeEncoder, num_orows, num_irows, static_cast(row_size)); - } else { - // Scan with indices - mtl_setArgs<3>(computeEncoder, num_orows, num_irows, static_cast(row_size)); - } - } - } else { - // Strided kernels - pass full tensor information - if (outputs.size() == 1) { - // Simple scan - mtl_setArgs<2>(computeEncoder, - self.sizes(), - self.strides(), - outputs[0].strides(), - static_cast(self.ndimension()), - static_cast(wrapped_dim)); - } else { - // Scan with indices - mtl_setArgs<3>(computeEncoder, - self.sizes(), - self.strides(), - outputs[0].strides(), - outputs[1].strides(), - static_cast(self.ndimension()), - static_cast(wrapped_dim)); - } - } - - mtl_dispatch1DJob(computeEncoder, scanPSO, num_threads); - - getMPSProfiler().endProfileKernel(scanPSO); - } - }); -} - // Utility function to get 2D grid dimensions for dispatch static std::pair get_2d_grid_dims(const IntArrayRef& shape, const int64_t dim) { size_t grid_x = 1; @@ -375,19 +256,11 @@ static void scan_with_indices_mps_impl(const Tensor& self, } // namespace mps void cummax_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) { - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummax"); - } else { - mps::scan_mps_impl(self, {values, indices}, dim, "cummax"); - } + mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummax"); } void cummin_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) { - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummin"); - } else { - mps::scan_mps_impl(self, {values, indices}, dim, "cummin"); - } + mps::scan_with_indices_mps_impl(self, values, indices, dim, "cummin"); } Tensor& _logcumsumexp_out_mps(const Tensor& self, int64_t dim, Tensor& result) { @@ -402,11 +275,7 @@ void cummin_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int6 return result; } - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { - mps::scan_simple_mps_impl(self, result, wrap_dim, "logcumsumexp"); - } else { - mps::scan_mps_impl(self, {result}, wrap_dim, "logcumsumexp"); - } + mps::scan_simple_mps_impl(self, result, wrap_dim, "logcumsumexp"); return result; } diff --git a/aten/src/ATen/native/mps/operations/Sort.mm b/aten/src/ATen/native/mps/operations/Sort.mm index c73b7c33098f1..6ff47044df133 100644 --- 
a/aten/src/ATen/native/mps/operations/Sort.mm +++ b/aten/src/ATen/native/mps/operations/Sort.mm @@ -2,6 +2,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include @@ -11,10 +12,85 @@ #include #include #else +#include #include #include #endif namespace at::native { +namespace { + +void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) { + using namespace mps; + if (self.dim() == 0 && self.numel() == 1) { + values.copy_(self); + indices.zero_(); + return; + } + // Handle empty tensors + if (self.numel() == 0) { + values.copy_(self); + indices.copy_(values.toType(at::ScalarType::Long)); + return; + } + // issue #154890, raising error to prevent crash within MPSGraph until + // workaround is implemented. + TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890"); + + auto stream = getCurrentMPSStream(); + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil; + }; + + // MPSGraph kthvalue is always sorted. + @autoreleasepool { + // Input as placeholders + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" + + std::to_string(k) + ":dim" + std::to_string(dim); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape); + + MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor; + MPSDataType dataType = getMPSDataType(self); + // #issue 104398441 sortWithTensor and argsortWithTensor + if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) { + dataType = (dataType & MPSDataTypeFloatBit) ? 
MPSDataTypeFloat32 : MPSDataTypeInt32; + castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"]; + } + MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor + axis:(NSUInteger)dim + descending:false + name:nil]; + sortedTensor = [mpsGraph sliceTensor:sortedTensor + dimension:(NSUInteger)dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor + axis:(NSInteger)dim + descending:false + name:@"kthvalue_out"]; + argSortedTensor = [mpsGraph sliceTensor:argSortedTensor + dimension:dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + newCachedGraph->valuesTensor = sortedTensor; + newCachedGraph->indicesTensor = argSortedTensor; + }); + Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self); + // Outputs as placeholders + Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); + Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); + // Create dictionary of inputs and outputs + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} +} // anonymous namespace // sort TORCH_IMPL_FUNC(sort_stable_out_mps) @@ -26,9 +102,6 @@ const Tensor& indices) { using namespace mps; - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); - MPS_CHECK_INT64_OP_SUPPORTED(self, macOS13_3_plus, "sort_stable_out"); - if (self.numel() == 0) { return; } @@ -55,8 +128,7 @@ auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape); - MPSGraphTensor* castInputTensor = - castToIHFTypes(mpsGraph, newCachedGraph->selfTensor, self, /*includesInt64=*/macOS13_3_plus); + MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, newCachedGraph->selfTensor, self); MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor axis:(NSInteger)dim descending:(BOOL)descending @@ -85,4 +157,31 @@ runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } + +std::tuple kthvalue_out_mps(const Tensor& self, + int64_t k, + int64_t dim_, + bool keepdim, + Tensor& values, + Tensor& indices) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of the kth value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("kthvalue MPS"); + + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + int64_t slicesize = self.dim() == 0 ? 
1 : self.size(dim); + TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim); + at::assert_no_overlap(self, values); + _reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim); + + kthvalue_out_mps_impl(self, k, dim, values, indices); + + if (!keepdim) { + values.squeeze_(dim); + indices.squeeze_(dim); + } + + return std::forward_as_tuple(values, indices); +} } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 6e030c99d0356..7b637d896f850 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -297,9 +297,6 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements, const auto common_type = at::result_type(elements, test_elements); TORCH_CHECK(elements.is_mps() && test_elements.is_mps()); - TORCH_CHECK(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) || supportedFloatingType(common_type), - "isin_Tensor_Tensor_out only works on floating types on MPS for pre MacOS_14_0. Received dtype: ", - common_type); @autoreleasepool { std::string key = op_name + getTensorsStringKey({elements, test_elements}) + std::to_string(invert); @@ -338,6 +335,9 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements, } static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) { + if (iter.numel() == 0) { + return; + } const auto& self = iter.input(0); auto& out = iter.output(0); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/UnaryKernel.mm b/aten/src/ATen/native/mps/operations/UnaryKernel.mm index b560739ed40c3..7e150b133cc65 100644 --- a/aten/src/ATen/native/mps/operations/UnaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/UnaryKernel.mm @@ -50,6 +50,7 @@ static void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals) { REGISTER_UNARY_TI_DISPATCH(log); REGISTER_UNARY_TI_DISPATCH(log1p); REGISTER_UNARY_TI_DISPATCH(bitwise_not); +REGISTER_UNARY_TI_DISPATCH(round); REGISTER_UNARY_TI_DISPATCH(sigmoid); REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel); } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index edf45a5ff80d0..d7ce40e5cbb4f 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -184,7 +184,6 @@ static void unary_op(const Tensor& self, REGISTER_MPS_UNARY_STUB(ceil, ceil); REGISTER_MPS_UNARY_STUB(floor, floor); -REGISTER_MPS_UNARY_STUB(round, round); REGISTER_MPS_UNARY_STUB(trunc, truncate); #define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ @@ -208,28 +207,12 @@ static void unary_op(const Tensor& self, } Tensor& angle_out_mps(const Tensor& self, Tensor& output) { - if (mps::supportsComplex()) { - mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { - auto realPart = [mpsGraph realPartOfTensor:inputTensor name:nil]; - auto imagPart = [mpsGraph imaginaryPartOfTensor:inputTensor name:nil]; - return [mpsGraph atan2WithPrimaryTensor:imagPart secondaryTensor:realPart name:nil]; - }); - return output; - } else { - TORCH_CHECK(!self.is_complex(), "MPS does not support angle with complex input on macOS13") - mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { - // On macOS 13 with 
non-complex input, realPartOfTensor and imaginaryPartOfTensor are - // not available, and NaN is not propagated correctly: - auto imagPart = [mpsGraph constantWithScalar:0.0 shape:inputTensor.shape dataType:inputTensor.dataType]; - auto result = [mpsGraph atan2WithPrimaryTensor:imagPart secondaryTensor:inputTensor name:nil]; - auto nanMask = [mpsGraph isNaNWithTensor:inputTensor name:nil]; - return [mpsGraph selectWithPredicateTensor:nanMask - truePredicateTensor:inputTensor - falsePredicateTensor:result - name:nil]; - }); - return output; - } + mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto realPart = [mpsGraph realPartOfTensor:inputTensor name:nil]; + auto imagPart = [mpsGraph imaginaryPartOfTensor:inputTensor name:nil]; + return [mpsGraph atan2WithPrimaryTensor:imagPart secondaryTensor:realPart name:nil]; + }); + return output; } Tensor angle_mps(const Tensor& self) { @@ -362,7 +345,6 @@ static void cumulative_op_impl(const Tensor& self, const Tensor& result, MPSCumulativeOpType cumulativeOpType, const std::string& op_name) { - bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS); auto nDims = self.dim(); auto wrapped_dim = maybe_wrap_dim(dim, nDims); TORCH_CHECK(wrapped_dim >= 0 && wrapped_dim < std::max(1LL, self.ndimension()), @@ -381,11 +363,6 @@ static void cumulative_op_impl(const Tensor& self, bool castInputData = (isIntegralType(input.scalar_type(), true) && input.scalar_type() != ScalarType::Int && input.scalar_type() != ScalarType::Long); - TORCH_CHECK(macOS13_3_plus || input.scalar_type() != ScalarType::Long, - "MPS does not support ", - op_name, - " op with int64 input. Support has been added in macOS 13.3"); - mps::unary_op( input, result, op_name + std::to_string(dim), ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { if (castInputData) { @@ -440,17 +417,10 @@ static void cumulative_op_impl(const Tensor& self, Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) { TORCH_CHECK(self.is_complex()); - if (!mps::supportsComplex()) { - if (!result.is_same_size(self)) { - result.resize_(self.sizes()); - } - at::real(result).copy_(at::real(self)); - at::imag(result).copy_(at::neg(at::imag(self))); - } else { - mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { - return [mpsGraph conjugateWithTensor:inputTensor name:nil]; - }); - } + TORCH_CHECK(self.dtype() != at::kComplexDouble); + mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + return [mpsGraph conjugateWithTensor:inputTensor name:nil]; + }); return result; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index db8eef9349642..abb061afc5c95 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -288,6 +288,7 @@ dispatch: CPU: native_dropout_cpu CUDA: native_dropout_cuda + MPS: native_dropout_mps NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested tags: [nondeterministic_seeded, core] autogen: native_dropout.out @@ -296,6 +297,7 @@ dispatch: CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward CUDA: native_dropout_backward_cuda + MPS: native_dropout_backward_mps autogen: native_dropout_backward.out tags: pointwise @@ -340,8 +342,8 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs - SparseCPU, SparseCUDA: abs_sparse - 
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: abs_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs tags: [core, pointwise] @@ -350,16 +352,16 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs_ - SparseCPU, SparseCUDA: abs_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS, MTIA: abs_out - SparseCPU, SparseCUDA: abs_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out tags: pointwise # Note [Adding an alias] @@ -428,7 +430,7 @@ variants: function, method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn tags: pointwise @@ -437,7 +439,7 @@ variants: method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_ tags: pointwise @@ -448,7 +450,7 @@ dispatch: CPU, CUDA: sgn_out MPS: sgn_out_mps - SparseCPU, SparseCUDA: sgn_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out tags: pointwise @@ -476,7 +478,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: _conj_physical - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr autogen: _conj_physical.out - func: conj_physical(Tensor self) -> Tensor @@ -487,8 +489,8 @@ dispatch: CPU, CUDA: conj_physical_out MPS: conj_physical_out_mps - SparseCPU, SparseCUDA: conj_physical_out_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out tags: pointwise - func: conj_physical_(Tensor(a!) self) -> Tensor(a!) 
@@ -554,7 +556,7 @@ structured_delegate: add.out variants: function, method dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor @@ -566,7 +568,7 @@ variants: method structured_delegate: add.out dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ MkldnnCPU: mkldnn_add_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor @@ -582,6 +584,7 @@ dispatch: SparseCPU, SparseMeta: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda + SparseMPS: add_out_sparse_mps SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu SparseCsrCUDA: add_out_sparse_compressed_cuda MkldnnCPU: mkldnn_add_out @@ -874,7 +877,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr tags: [core, pointwise] @@ -882,7 +885,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_ tags: pointwise @@ -892,7 +895,7 @@ dispatch: CPU, CUDA: asinh_out MPS: asinh_out_mps - SparseCPU, SparseCUDA: asinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out tags: pointwise @@ -909,7 +912,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr tags: [core, pointwise] @@ -917,7 +920,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_ tags: pointwise @@ -927,7 +930,7 @@ dispatch: CPU, CUDA: atanh_out MPS: atanh_out_mps - SparseCPU, SparseCUDA: atanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out tags: pointwise # arctanh, alias for atanh @@ -964,7 +967,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse + SparseCPU, SparseCUDA, SparseMPS: asin_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr tags: [core, pointwise] @@ -973,7 +976,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_ tags: pointwise @@ -983,7 +986,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: asin_out - SparseCPU, SparseCUDA: asin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out tags: pointwise @@ -1001,7 +1004,7 @@ structured_delegate: atan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse + SparseCPU, SparseCUDA, SparseMPS: atan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr tags: [core, pointwise] @@ -1010,7 +1013,7 @@ structured_delegate: atan.out 
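The SparseMPS / SparseCsrMPS dispatch entries in the hunks above and below reuse the existing shared sparse kernels (abs_sparse, sgn_sparse, conj_physical_out_sparse, ...) for MPS tensors. A small usage sketch, assuming an MPS-enabled build (the CPU fallback keeps the snippet runnable elsewhere):

import torch

# Elementwise ops on sparse COO tensors, e.g. abs(), now dispatch on MPS.
device = "mps" if torch.backends.mps.is_available() else "cpu"
indices = torch.tensor([[0, 1, 1], [2, 0, 2]])
values = torch.tensor([-3.0, 4.0, -5.0])
s = torch.sparse_coo_tensor(indices, values, (2, 3), device=device)
print(s.abs().coalesce().values())   # tensor([3., 4., 5.])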
variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_ tags: pointwise @@ -1020,7 +1023,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: atan_out - SparseCPU, SparseCUDA: atan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out tags: pointwise @@ -1459,7 +1462,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr tags: [core, pointwise] @@ -1468,7 +1471,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse_ + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_ tags: pointwise @@ -1478,7 +1481,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: ceil_out - SparseCPU, SparseCUDA: ceil_sparse_out + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out tags: pointwise @@ -2406,7 +2409,7 @@ MPS: empty_mps Meta: empty_meta_symint MkldnnCPU: empty_mkldnn - SparseCPU, SparseCUDA: empty_sparse + SparseCPU, SparseCUDA, SparseMPS: empty_sparse SparseMeta: empty_sparse_symint SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed SparseCsrMeta: empty_sparse_compressed_symint @@ -2534,7 +2537,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse + SparseCPU, SparseCUDA, SparseMPS: erf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr tags: [core, pointwise] @@ -2543,7 +2546,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse_ + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_ tags: pointwise @@ -2553,7 +2556,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: erf_out - SparseCPU, SparseCUDA: erf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out tags: pointwise @@ -2619,7 +2622,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr tags: [core, pointwise] @@ -2628,7 +2631,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse_ + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_ tags: pointwise @@ -2638,7 +2641,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: expm1_out - SparseCPU, SparseCUDA: expm1_sparse_out + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out tags: pointwise @@ -2737,7 +2740,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse + SparseCPU, SparseCUDA, SparseMPS: floor_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr tags: [core, pointwise] @@ -2746,7 +2749,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse_ + SparseCPU, SparseCUDA, SparseMPS: 
floor_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_ tags: pointwise @@ -2756,7 +2759,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: floor_out - SparseCPU, SparseCUDA: floor_sparse_out + SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out tags: pointwise @@ -2764,7 +2767,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, MPS: floor_divide + CPU, CUDA, MPS, MTIA: floor_divide SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) @@ -2798,7 +2801,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse + SparseCPU, SparseCUDA, SparseMPS: frac_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr tags: pointwise @@ -2807,7 +2810,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse_ + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_ tags: pointwise @@ -2818,7 +2821,7 @@ dispatch: CPU, CUDA: frac_out MPS: frac_out_mps - SparseCPU, SparseCUDA: frac_sparse_out + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out tags: pointwise @@ -2931,6 +2934,7 @@ dispatch: CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda + MPS: grid_sampler_3d_mps autogen: grid_sampler_3d.out # `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for @@ -3207,7 +3211,7 @@ dispatch: CPU, CUDA, MPS, MTIA: isnan NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan - SparseCPU, SparseCUDA: isnan_sparse + SparseCPU, SparseCUDA, SparseMPS: isnan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr autogen: isnan.out tags: [core, pointwise] @@ -3288,6 +3292,7 @@ dispatch: CPU: kthvalue_out_cpu CUDA: kthvalue_out_cuda + MPS: kthvalue_out_mps - func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -3335,21 +3340,21 @@ variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num - SparseCPU, SparseCUDA: nan_to_num_sparse + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse tags: pointwise - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num_ - SparseCPU, SparseCUDA: nan_to_num_sparse_ + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_ tags: pointwise - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA, MTIA: nan_to_num_out MPS: nan_to_num_out_mps - SparseCPU, SparseCUDA: nan_to_num_sparse_out + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out tags: pointwise - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor @@ -3447,8 +3452,12 @@ - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor? bias) -> Tensor +- func: fbgemm_linear_fp16_weight_fp32_activation.out(Tensor input, Tensor packed_weight, Tensor? bias, Tensor(a!) output) -> Tensor + - func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor +- func: fbgemm_linear_fp16_weight.out(Tensor input, Tensor packed_weight, Tensor bias, Tensor(a!) 
output) -> Tensor + - func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor @@ -3548,7 +3557,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr tags: [core, pointwise] @@ -3557,7 +3566,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse_ + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_ tags: pointwise @@ -3567,7 +3576,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: log1p_out - SparseCPU, SparseCUDA: log1p_sparse_out + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out tags: pointwise @@ -4230,6 +4239,7 @@ - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor dispatch: CPU: _weight_int8pack_mm_cpu + CUDA: _weight_int8pack_mm_cuda MPS: _weight_int8pack_mm_mps - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor @@ -4658,7 +4668,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg - SparseCPU, SparseCUDA: rad2deg_sparse + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr tags: pointwise @@ -4666,14 +4676,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg_ - SparseCPU, SparseCUDA: rad2deg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_ tags: pointwise - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: rad2deg_out - SparseCPU, SparseCUDA: rad2deg_sparse_out + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out tags: pointwise @@ -4681,7 +4691,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad - SparseCPU, SparseCUDA: deg2rad_sparse + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr tags: pointwise @@ -4689,14 +4699,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad_ - SparseCPU, SparseCUDA: deg2rad_sparse_ + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_ tags: pointwise - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CompositeExplicitAutograd: deg2rad_out - SparseCPU, SparseCUDA: deg2rad_sparse_out + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out tags: pointwise @@ -4922,7 +4932,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg tags: [core, pointwise] @@ -4932,7 +4942,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: neg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_ tags: pointwise @@ -4943,7 +4953,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: neg_out - SparseCPU, SparseCUDA: neg_out_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out tags: pointwise # Alias for neg @@ -5027,7 +5037,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse + SparseCPU, SparseCUDA, SparseMPS: round_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr tags: [core, pointwise] @@ -5036,7 +5046,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse_ + SparseCPU, SparseCUDA, SparseMPS: round_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_ tags: pointwise @@ -5046,7 +5056,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: round_out - SparseCPU, SparseCUDA: round_sparse_out + SparseCPU, SparseCUDA, SparseMPS: round_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out tags: pointwise @@ -5089,7 +5099,7 @@ QuantizedCPU: relu_quantized_cpu QuantizedCUDA: relu_quantized_cuda NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu - SparseCPU, SparseCUDA: relu_sparse + SparseCPU, SparseCUDA, SparseMPS: relu_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr tags: [core, pointwise] @@ -5104,7 +5114,7 @@ QuantizedCPU: relu_quantized_cpu_ QuantizedCUDA: relu_quantized_cuda_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_ - SparseCPU, SparseCUDA: relu_sparse_ + SparseCPU, SparseCUDA, SparseMPS: relu_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_ autogen: relu.out tags: pointwise @@ -5391,7 +5401,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr - SparseCPU, SparseCUDA: sin_sparse + SparseCPU, SparseCUDA, SparseMPS: sin_sparse NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin tags: [core, pointwise] @@ -5401,7 +5411,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_ - SparseCPU, SparseCUDA: sin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_ tags: pointwise - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
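Among the entries above, _weight_int8pack_mm picks up a CUDA dispatch next to the existing CPU and MPS kernels. Assuming the usual weight-only-quantization reading of the schema (float activations shaped [M, K], an int8 weight stored as [N, K], one float scale per output channel), the math it computes might look like the sketch below; this is an illustration of the assumed semantics, not the actual kernel.

// Reference semantics sketch (assumed): float activations, int8 weight [N, K],
// per-output-channel float scales [N].
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<float> int8_weight_only_mm(
    const std::vector<float>& a, int64_t M, int64_t K,
    const std::vector<int8_t>& w, int64_t N,   // w laid out as [N, K]
    const std::vector<float>& scales) {        // one scale per output channel
  std::vector<float> out(M * N, 0.f);
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t k = 0; k < K; ++k) {
        acc += a[m * K + k] * static_cast<float>(w[n * K + k]);
      }
      out[m * N + n] = acc * scales[n];        // dequantize per channel
    }
  }
  return out;
}

int main() {
  // 1x2 activation times a [N=2, K=2] int8 weight.
  auto y = int8_weight_only_mm({1.f, 2.f}, 1, 2, {10, 20, 30, 40}, 2, {0.1f, 0.01f});
  std::cout << y[0] << " " << y[1] << "\n";    // prints: 5 1.1
}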
@@ -5411,7 +5421,7 @@ dispatch: CPU, CUDA, MPS, MTIA: sin_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out - SparseCPU, SparseCUDA: sin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out tags: pointwise - func: sinc(Tensor self) -> Tensor @@ -5436,7 +5446,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr tags: [core, pointwise] @@ -5445,7 +5455,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_ tags: pointwise @@ -5455,7 +5465,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: sinh_out - SparseCPU, SparseCUDA: sinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out # Returns a copy of this `Variable` that is detached from its autograd graph. @@ -5503,6 +5513,13 @@ tags: core manual_cpp_binding: True +- func: sym_is_contiguous(Tensor self, MemoryFormat memory_format=contiguous_format) -> SymBool + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + - func: sym_numel(Tensor self) -> SymInt variants: function device_check: NoCheck @@ -5898,7 +5915,7 @@ variants: function, method dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt - SparseCPU, SparseCUDA: sqrt_sparse + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr tags: [core, pointwise] @@ -5907,7 +5924,7 @@ structured_delegate: sqrt.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sqrt_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_ tags: pointwise @@ -5917,7 +5934,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: sqrt_out - SparseCPU, SparseCUDA: sqrt_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out tags: pointwise @@ -6055,7 +6072,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse + SparseCPU, SparseCUDA, SparseMPS: tan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr tags: [core, pointwise] @@ -6064,7 +6081,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_ tags: pointwise @@ -6074,7 +6091,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: tan_out - SparseCPU, SparseCUDA: tan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out tags: pointwise @@ -6085,7 +6102,7 @@ dispatch: QuantizedCPU: tanh_quantized_cpu MkldnnCPU: mkldnn_tanh - SparseCPU, SparseCUDA: tanh_sparse + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh tags: [core, pointwise] @@ -6096,7 +6113,7 @@ variants: function, method dispatch: MkldnnCPU: mkldnn_tanh_ - SparseCPU, SparseCUDA: tanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_ 
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_ tags: pointwise @@ -6107,7 +6124,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: tanh_out - SparseCPU, SparseCUDA: tanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out tags: pointwise @@ -6379,8 +6396,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr tags: [core, pointwise] - func: trunc_(Tensor(a!) self) -> Tensor(a!) @@ -6388,8 +6405,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_ tags: pointwise - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -6398,8 +6415,8 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS: trunc_out - SparseCPU, SparseCUDA: trunc_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out tags: pointwise # Alias for trunc @@ -6909,7 +6926,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: clone - SparseCPU, SparseCUDA: clone_sparse + SparseCPU, SparseCUDA, SparseMPS: clone_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone @@ -6944,7 +6961,7 @@ CPU, CUDA: zero_ MPS: zero_mps_ Meta: zero_meta_ - SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zero_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_ @@ -7150,6 +7167,7 @@ - func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor variants: function dispatch: + CompositeExplicitAutograd: _grouped_mm CUDA: _grouped_mm_cuda # NOTE [ Sparse: autograd and API ] @@ -7361,8 +7379,8 @@ - func: _to_dense(Tensor self, ScalarType? dtype=None, bool? 
masked_grad=None) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA: sparse_to_dense - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense + SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense autogen: _to_dense.out @@ -7388,8 +7406,8 @@ - func: dense_dim(Tensor self) -> int variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr CompositeExplicitAutograd: dense_dim_default device_check: NoCheck device_guard: False @@ -7422,6 +7440,7 @@ dispatch: SparseCPU: _coalesce_sparse_cpu SparseCUDA: _coalesce_sparse_cuda + SparseMPS: _coalesce_sparse_mps autogen: _coalesce.out - func: is_coalesced(Tensor self) -> bool @@ -7460,7 +7479,7 @@ - func: indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: indices_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: indices_sparse CompositeExplicitAutograd: indices_default device_check: NoCheck device_guard: False @@ -7468,7 +7487,7 @@ - func: values(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: values_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: values_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: values_nested CompositeExplicitAutograd: values_default @@ -7521,7 +7540,7 @@ device_check: NoCheck # Allows copy into different device variants: function dispatch: - SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_ autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors @@ -9712,7 +9731,7 @@ structured_delegate: sign.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sign_sparse + SparseCPU, SparseCUDA, SparseMPS: sign_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr tags: [core, pointwise] @@ -9721,7 +9740,7 @@ structured_delegate: sign.out variants: method dispatch: - SparseCPU, SparseCUDA: sign_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_ tags: pointwise @@ -9732,7 +9751,7 @@ dispatch: CPU, CUDA: sign_out MPS: sign_out_mps - SparseCPU, SparseCUDA: sign_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out tags: pointwise @@ -9740,7 +9759,7 @@ variants: function, method structured_delegate: signbit.out dispatch: - SparseCPU, SparseCUDA: signbit_sparse + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr tags: pointwise @@ -9751,7 +9770,7 @@ CPU: signbit_out CUDA: signbit_out MPS: signbit_out_mps - SparseCPU, SparseCUDA: signbit_sparse_out + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out tags: pointwise @@ -9934,7 +9953,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igamma_out + CPU, CUDA, MPS: igamma_out tags: pointwise - func: igamma(Tensor self, Tensor other) -> Tensor @@ -9951,7 +9970,7 @@ 
structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igammac_out + CPU, CUDA, MPS: igammac_out tags: pointwise - func: igammac(Tensor self, Tensor other) -> Tensor @@ -13255,7 +13274,7 @@ dispatch: CompositeExplicitAutograd: isinf NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf - SparseCPU, SparseCUDA: isinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isinf_sparse SparseMeta: isinf_sparse_meta SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr autogen: isinf.out @@ -13271,7 +13290,7 @@ structured_delegate: isposinf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf - SparseCPU, SparseCUDA: isposinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr tags: pointwise @@ -13280,7 +13299,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isposinf_out - SparseCPU, SparseCUDA: isposinf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out tags: pointwise @@ -13289,7 +13308,7 @@ structured_delegate: isneginf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf - SparseCPU, SparseCUDA: isneginf_sparse + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr tags: pointwise @@ -13298,7 +13317,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isneginf_out - SparseCPU, SparseCUDA: isneginf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out tags: pointwise @@ -15011,6 +15030,7 @@ - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor) dispatch: CUDA: _scaled_dot_product_cudnn_attention_backward_cuda + NestedTensorCUDA: _scaled_dot_product_cudnn_attention_nestedtensor_backward_cuda tags: nondeterministic_seeded - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask) @@ -15043,6 +15063,11 @@ CUDA: _cudnn_attention_forward tags: nondeterministic_seeded +- func: _cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? 
scale=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _cudnn_attention_backward + tags: nondeterministic_seeded + - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor variants: function dispatch: diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index 5b7476453407e..96c6ab8310f80 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -349,6 +349,63 @@ _scaled_dot_product_cudnn_attention_nestedtensor_cuda( return std::make_tuple(std::move(attention), std::move(log_sumexp), cumulative_sequence_length_q, cumulative_sequence_length_kv, max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor()); } +std::tuple _scaled_dot_product_cudnn_attention_nestedtensor_backward_cuda( + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + const Tensor& philox_seed, + const Tensor& philox_offset, + const Tensor& attn_bias, + const Tensor& cum_seq_q, + const Tensor& cum_seq_k, + const int64_t max_q, + const int64_t max_k, + double dropout_p, + bool is_causal, + std::optional scale) { + if (!grad_out.defined()) { + return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); + } + auto [ + grad_out_buffer_reshaped, + query_buffer_reshaped, + key_buffer_reshaped, + value_buffer_reshaped, + output_buffer_reshaped] = + preprocessing::sdpa_nested_preprocessing_backward( + grad_out, + query, + key, + value, + out, + cum_seq_q, + cum_seq_k, + max_q, + max_k); + + auto [dq, dk, dv] = at::_cudnn_attention_backward(grad_out_buffer_reshaped, + query_buffer_reshaped, + key_buffer_reshaped, + value_buffer_reshaped, + output_buffer_reshaped, + logsumexp, + philox_seed, + philox_offset, + attn_bias, + cum_seq_q, + cum_seq_k, + max_q, + max_k, + dropout_p, + is_causal, + scale); + return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); +} + + std::tuple _scaled_dot_product_flash_attention_backward_nested( const at::Tensor& grad_out_, const at::Tensor& query, diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 4ca777be9cd44..f804670c31538 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -335,6 +335,8 @@ std::tuple choose_qparams_optimized( const int64_t n_bins, const double ratio, int64_t bit_width) { + const float* input_row = input_tensor.const_data_ptr(); + TORCH_CHECK_VALUE(input_row != nullptr, "input tensor is empty and has no data"); if (numel < 0 || numel > input_tensor.numel()) { TORCH_CHECK(false, "numel is out of the bound of input tensor"); @@ -342,7 +344,7 @@ std::tuple choose_qparams_optimized( TORCH_CHECK(numel <= input_tensor.numel(), "numel ", numel, " greater than input_tensor.numel() ", input_tensor.numel()); - const float* input_row = input_tensor.const_data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); float xmax = *std::max_element(input_row, input_row + numel); float n_bins_float = static_cast(n_bins); diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h index 7722272dfcc27..963a47a21fa9f 100644 --- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h +++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h @@ -460,4 
+460,6 @@ at::Tensor _qconv_prepack_onednn( int64_t groups, std::optional> input_shape=std::nullopt); +#define FP8E4M3_MAX 448.0 + #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp index e9e32e43ae022..42c000ee09d5c 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp @@ -17,6 +17,7 @@ #include #include +#include namespace at::native { @@ -53,8 +54,8 @@ static void upsample_nearest2d_out_frame( return; } - std::unique_ptr input_offset_arr(new int64_t[output_width]); - int64_t* input_offset = input_offset_arr.get(); + std::vector input_offset_arr(output_width); + int64_t* input_offset = input_offset_arr.data(); for (const auto w2 : c10::irange(output_width)) { const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width); diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 8624c9ef03367..3b50bad579023 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -1483,6 +1483,8 @@ static at::Tensor _fp8_convolution_onednn_ref( } y_f32.div_(output_scale); if (x1.scalar_type() == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 y_f32 = y_f32.to(at::kHalf); } @@ -1497,6 +1499,8 @@ static at::Tensor _fp8_convolution_onednn_ref( y_f32.div_(output_scale); auto out_dtype = output_dtype.has_value() ? output_dtype.value() : at::kFloat8_e4m3fn; if (out_dtype == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 return y_f32.to(at::kHalf).to(out_dtype); } @@ -1730,12 +1734,13 @@ static at::Tensor _quantized_convolution_onednn( output_sizes = at::native::conv_output_size(input_size, kernel_size, padding.vec(), stride.vec(), dilation.vec()); ideep::dims dst_dims = ideep::dims({output_sizes.cbegin(), output_sizes.cend()}); // Output is not a quantized tensor but data type is uint8 + auto out_dtype = output_dtype.has_value() ? output_dtype.value() : act_dtype; at::Tensor output = has_accum_postop_sum ? accum.value() : at::empty( dst_dims, at::device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : act_dtype)) + .dtype(out_dtype) .memory_format(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d) @@ -1755,6 +1760,16 @@ static at::Tensor _quantized_convolution_onednn( unary_scalars, unary_algorithm.has_value() ? unary_algorithm.value() : "" ); + // Avoid NaN if output dtype is fp8 + if (out_dtype == c10::kFloat8_e4m3fn) { + // To avoid NaN, we need to clamp the intermediate results (in fp32) to [-448, 448] + // before converting to fp8 + auto post_ops = op_attr.get_post_ops(); + post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.0/output_scale, 0.0); + post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, -FP8E4M3_MAX, FP8E4M3_MAX); + op_attr.set_post_ops(post_ops); + output_scale = 1.0f; + } #if IDEEP_PREREQ(3, 1, 0, 0) // Use oneDNN's APIs instead of prepare/compute from ideep to reduce integration overhead.
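Both fp8 paths above protect the final cast in the same way: the eager reference clamps the fp32 intermediate into the finite float8_e4m3fn range before the fp32 -> fp16 -> fp8 conversion, and the fused oneDNN path expresses the same step as an eltwise_linear post-op (folding in 1/output_scale) followed by an eltwise_clip, with output_scale reset to 1. Since e4m3fn has no infinity encoding and its largest finite value is 448, an unclamped overshoot cannot saturate to inf and can surface as NaN, which is what the "Avoid NaN" comments refer to. A small standalone sketch of the clamp step (the fp8 storage itself is not modeled here):

// Standalone sketch of the pre-cast clamp; matches FP8E4M3_MAX = 448.0 above.
#include <algorithm>
#include <iostream>

constexpr float kFp8E4M3Max = 448.0f;

// Mirror of the eager path: divide by output_scale, then clamp into the
// finite e4m3fn range before the narrowing fp32 -> fp16 -> fp8 cast.
float prepare_for_fp8(float acc, float output_scale) {
  float y = acc / output_scale;
  return std::min(std::max(y, -kFp8E4M3Max), kFp8E4M3Max);
}

int main() {
  // An accumulator of 100 with output_scale 0.1 overshoots to 1000, which has
  // no finite e4m3fn encoding; the clamp keeps the value at 448.
  std::cout << prepare_for_fp8(100.0f, 0.1f) << "\n";  // prints: 448
}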
diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index 1e91fecd45005..807a9b25d3772 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -333,14 +333,14 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) { weight.scalar_type() == at::ScalarType::Float || weight.scalar_type() == at::ScalarType::Half, "'embedding_bag_byte_prepack' only support float32 or float16."); - const auto weight_sizes = weight.sizes(); - const auto cols_dim = weight_sizes.size() - 1; - const int32_t embedding_cols = static_cast(weight_sizes[cols_dim]); + const auto weight_sizes = weight.sym_sizes(); + const auto cols_dim = weight.ndimension() - 1; + const auto embedding_cols = weight_sizes[cols_dim]; // Add 8 bytes per column to store FP32 scale and zero_point per row. - const int32_t output_columns = static_cast(embedding_cols + 2 * sizeof(float)); + const auto output_columns = embedding_cols + 2 * sizeof(float); // Adjust output dimensions to account for FP32 scale and zero_points. - std::vector output_shape = weight_sizes.vec(); + auto output_shape = weight_sizes.vec(); output_shape.at(cols_dim) = output_columns; at::SymDimVector output_shape_vec(output_shape); diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index bd6a1086c8cb9..a3a494d16fd69 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -1012,6 +1012,12 @@ static at::Tensor fp8_qlinear_onednn_ref( "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); } y_f32.div_(output_scale); + if (x1.scalar_type() == c10::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); + // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 + y_f32 = y_f32.to(at::kHalf); + } x1.copy_(y_f32.to(x1.scalar_type()).view(x1.sizes())); return x1; } else if (binary_post_op == "add") { @@ -1038,6 +1044,12 @@ static at::Tensor fp8_qlinear_onednn_ref( y_f32.div_(output_scale); y_f32 = y_f32.view(output_size); auto out_dtype = output_dtype.has_value() ? output_dtype.value() : at::kFloat8_e4m3fn; + if (out_dtype == at::kFloat8_e4m3fn) { + // Avoid NaN + y_f32.clamp_(-FP8E4M3_MAX, FP8E4M3_MAX); + // Align with oneDNN: convert fp32 to fp8 by fp32 -> fp16 -> fp8 + return y_f32.to(at::kHalf).to(out_dtype); + } return y_f32.to(out_dtype); } @@ -1118,7 +1130,7 @@ static at::Tensor linear_int8_with_onednn_weight( #if defined(__powerpc__) if (is_fp8) { #else - if(is_fp8 && !cpuinfo_has_x86_amx_int8()) { + if(is_fp8 && !cpuinfo_has_x86_amx_fp16()) { #endif // Fall back to ref impl on old platforms because not supported // Transpose weight to align with behavior in oneDNN @@ -1155,12 +1167,13 @@ static at::Tensor linear_int8_with_onednn_weight( } std::vector src_dims = {M, K}; std::vector dst_dims = {M, N}; + auto out_dtype = output_dtype.has_value() ? output_dtype.value() : input.scalar_type(); at::Tensor output = binary_post_op == "sum" ? other.value() : at::empty( dst_dims, at::device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bf16_output ? 
c10::kBFloat16 : input.scalar_type())) + .dtype(out_dtype) ); if (output.numel() == 0) { return output; } @@ -1195,6 +1208,16 @@ static at::Tensor linear_int8_with_onednn_weight( unary_post_op_args, unary_post_op_algorithm ); + // Avoid NaN if output dtype is fp8 + if (out_dtype == c10::kFloat8_e4m3fn) { + // To avoid NaN, we need to clamp the intermediate results (in fp32) to [-448, 448] + // before converting to fp8 + auto post_ops = op_attr.get_post_ops(); + post_ops.append_eltwise(dnnl::algorithm::eltwise_linear, 1.0/output_scale, 0.0); + post_ops.append_eltwise(dnnl::algorithm::eltwise_clip, -FP8E4M3_MAX, FP8E4M3_MAX); + op_attr.set_post_ops(post_ops); + output_scale = 1.0f; + } if (input_scale != 1.0f) { op_attr.set_scales_mask(DNNL_ARG_SRC, 0); } diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 3bd68feca1c2f..b4ae4e677bcd2 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -305,7 +305,7 @@ static inline at::Tensor pack_weight_to_onednn_tensor( #if defined(__powerpc__) if (is_fp8){ #else - if(is_fp8 && !cpuinfo_has_x86_amx_int8()) { + if(is_fp8 && !cpuinfo_has_x86_amx_fp16()) { #endif // oneDNN's fp8 requires AMX support // If AMX is not available, fall back to reference implementation diff --git a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp index 5a3f5f14dc0a7..c841da8354b5f 100644 --- a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp @@ -23,6 +23,9 @@ #include #endif +#if AT_USE_EIGEN_SPARSE() +#include +#endif namespace at::native::sparse::impl { @@ -442,13 +445,15 @@ void add_out_sparse_csr( const Tensor& mat2, const Scalar& alpha, const Tensor& result) { -#if !AT_MKL_ENABLED() - TORCH_CHECK( - false, - "Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ", - "Please use PyTorch built MKL support."); -#else +#if AT_USE_MKL_SPARSE() sparse::impl::mkl::add_out_sparse_csr(mat1, mat2, alpha, result); +#elif AT_USE_EIGEN_SPARSE() + sparse::impl::eigen::add_out_sparse(mat1, mat2, alpha, result); +#else + TORCH_CHECK( + false, + "Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ", + "Please use PyTorch built MKL support."); #endif } @@ -459,7 +464,7 @@ void triangular_solve_out_sparse_csr( bool upper, bool transpose, bool unitriangular) { -#if !AT_MKL_ENABLED() +#if !AT_USE_MKL_SPARSE() TORCH_CHECK( false, "Calling triangular_solve on a sparse CPU tensor requires compiling PyTorch with MKL. ", diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp index ba94f98551747..4faa135713d65 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp @@ -127,6 +127,10 @@ #include #endif +#if AT_USE_EIGEN_SPARSE() +#include +#endif + #include namespace at { @@ -536,7 +540,12 @@ static void addmm_out_sparse_csr_native_cpu( auto values = sparse.values(); scalar_t cast_alpha = alpha.to(); - r.mul_(beta); + // If beta is zero NaN and Inf should not be propagated to the result + if (beta.toComplexDouble() == 0.) 
{ + r.zero_(); + } else { + r.mul_(beta); + } AT_DISPATCH_INDEX_TYPES( col_indices.scalar_type(), "csr_mm_crow_indices", [&]() { auto csr_accessor = csr.accessor(); @@ -648,6 +657,15 @@ Tensor& addmm_out_sparse_compressed_cpu( return result; } +#if AT_USE_EIGEN_SPARSE() + if ((result.layout() == kSparseCsr || result.layout() == kSparseCsc) && + (mat1.layout() == kSparseCsr || mat1.layout() == kSparseCsc) && + (mat2.layout() == kSparseCsr || mat2.layout() == kSparseCsc)) { + sparse::impl::eigen::addmm_out_sparse(mat1, mat2, result, alpha, beta); + return result; + } +#endif + #if !AT_USE_MKL_SPARSE() // The custom impl addmm_out_sparse_csr_native_cpu only supports CSR @ // strided -> strided diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index 693ca536a3198..c11588a32ba05 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -196,9 +196,17 @@ C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, Dtype *values, Dtype *newValues, - int64_t nnz, int64_t newNnz, int64_t stride) { + int64_t nnz, int64_t newNnz, +#ifdef USE_ROCM + int64_t nsegments, +#endif + int64_t stride) { - int seg = blockIdx.x * 4 + threadIdx.y; +#ifdef USE_ROCM + int64_t seg = (blockIdx.x * gridDim.y + blockIdx.y) * 4 + threadIdx.y; +#else + int64_t seg = blockIdx.x * 4 + threadIdx.y; +#endif // Number of values processed by each thread (grain size) const int SZ = 4; @@ -207,7 +215,11 @@ __global__ void coalesceValuesKernel( const int newValueRow = seg * stride; const int begin = segment_offsets[seg]; const int end = (seg < newNnz - 1) ? segment_offsets[seg + 1] : nnz; +#ifdef USE_ROCM + const int startFeature = threadIdx.x + blockIdx.z * nsegments * SZ; +#else const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; +#endif Acctype tmp[SZ]; #pragma unroll for (int ii = 0; ii < SZ; ii++) { @@ -250,9 +262,17 @@ C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, bool *values, bool *newValues, - int64_t nnz, int64_t newNnz, int64_t stride) { + int64_t nnz, int64_t newNnz, +#ifdef USE_ROCM + int64_t nsegments, +#endif + int64_t stride) { - int seg = blockIdx.x * 4 + threadIdx.y; +#ifdef USE_ROCM + int64_t seg = (blockIdx.x * gridDim.y + blockIdx.y) * 4 + threadIdx.y; +#else + int64_t seg = blockIdx.x * 4 + threadIdx.y; +#endif // Number of values processed by each thread (grain size) const int SZ = 4; @@ -261,7 +281,11 @@ __global__ void coalesceValuesKernel( const int newValueRow = seg * stride; const int begin = segment_offsets[seg]; const int end = (seg < newNnz - 1) ? 
segment_offsets[seg + 1] : nnz; +#ifdef USE_ROCM + const int startFeature = threadIdx.x + blockIdx.z * nsegments * SZ; +#else const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; +#endif bool tmp[SZ]; #pragma unroll for (int ii = 0; ii < SZ; ii++) { diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index a36ec9b203fc3..b59221a3231a5 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -106,8 +106,17 @@ SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) { values = values.contiguous(); int64_t stride = c10::multiply_integers(values.sizes().slice(1)); int warp_size = at::cuda::warp_size(); +#ifdef USE_ROCM + const int64_t BATCHING_SEGMENT = 4096; + int64_t nsegments = ceil_div(newNnz, (int64_t) SZ); + int64_t s_batch = ceil_div(nsegments, BATCHING_SEGMENT); + dim3 grid(s_batch, (s_batch == 1) ? nsegments : BATCHING_SEGMENT, ceil_div(stride, (int64_t) warp_size*SZ)); +#else dim3 grid(ceil_div(newNnz, (int64_t) SZ), ceil_div(stride, (int64_t) warp_size*SZ)); +#endif dim3 block(warp_size, SZ); +#ifdef USE_ROCM + // Must duplicate the whole section otherwise does not compile on Windows AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( at::ScalarType::ComplexHalf, at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, values.scalar_type(), "coalesce_sparse_cuda", [&] { @@ -119,10 +128,28 @@ SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) { newValues.data_ptr(), nnz, newNnz, + nsegments, stride ); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); +#else + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + at::ScalarType::ComplexHalf, at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, + values.scalar_type(), "coalesce_sparse_cuda", [&] { + using cuda_accscalar_t = acc_type; + apply::coalesceValuesKernel<<>>( + uniqueOffsets.data_ptr(), + origIndices.data_ptr(), + values.data_ptr(), + newValues.data_ptr(), + nnz, + newNnz, + stride + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#endif } // this grid-strided version is slower but probably more flexible diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index fe0ddd087dd3b..3730ceb913547 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -800,7 +800,7 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor Tensor indices_dim1 = indices[1].to(ScalarType::Int); Tensor indices_dim2 = indices[2].to(ScalarType::Int); - std::unique_ptr mat_el_end_indices_host(new int64_t[num_matrices]); + std::vector mat_el_end_indices_host(num_matrices); { auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); @@ -809,14 +809,14 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor search_end_matrix_indices(mat_el_end_indices_device, num_matrices, indices_dim0); AT_CUDA_CHECK(cudaMemcpy( - mat_el_end_indices_host.get(), + mat_el_end_indices_host.data(), mat_el_end_indices_device, num_matrices*sizeof(int64_t), cudaMemcpyDeviceToHost )); } // Need a pointer to an array to access within a lambda - int64_t* mat_el_end_indices = &mat_el_end_indices_host[0]; + int64_t* mat_el_end_indices = mat_el_end_indices_host.data(); Scalar beta = 0; Scalar alpha = 1; diff --git a/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp 
new file mode 100644 index 0000000000000..20738992a61d9 --- /dev/null +++ b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp @@ -0,0 +1,329 @@ +#include + +#if AT_USE_EIGEN_SPARSE() + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include + +#include + +namespace at::native::sparse::impl::eigen { + +namespace { + +void inline sparse_indices_to_result_dtype_inplace( + const c10::ScalarType& dtype, + const at::Tensor& input) { + auto [compressed_indices, plain_indices] = + at::sparse_csr::getCompressedPlainIndices(input); + static_cast(input.unsafeGetTensorImpl()) + ->set_member_tensors( + compressed_indices.to(dtype), + plain_indices.to(dtype), + input.values(), + input.sizes()); +} + +void inline sparse_indices_and_values_resize( + const at::Tensor& input, + int64_t nnz) { + auto [compressed_indices, plain_indices] = + at::sparse_csr::getCompressedPlainIndices(input); + static_cast(input.unsafeGetTensorImpl()) + ->set_member_tensors( + compressed_indices, + plain_indices.resize_({nnz}), + input.values().resize_({nnz}), + input.sizes()); +} + +template +const Eigen::Map> +Tensor_to_Eigen(const at::Tensor& tensor) { + int64_t rows = tensor.size(0); + int64_t cols = tensor.size(1); + int64_t nnz = tensor._nnz(); + TORCH_CHECK(tensor.values().is_contiguous(), "eigen accepts only contiguous tensor values"); + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor); + index_t* c_indices_ptr = compressed_indices.data_ptr(); + index_t* p_indices_ptr = plain_indices.data_ptr(); + scalar_t* values_ptr = tensor.values().data_ptr(); + Eigen::Map> map( + rows, cols, nnz, c_indices_ptr, p_indices_ptr, values_ptr); + return map; +} + +template +void Eigen_to_Tensor( + const at::Tensor& tensor, + const Eigen::SparseMatrix& matrix) { + const Layout eigen_layout = (eigen_options == Eigen::RowMajor ? kSparseCsr : kSparseCsc); + TORCH_CHECK( + tensor.layout() == eigen_layout, + "Eigen_to_Tensor, expected tensor be ", eigen_layout, ", but got ", + tensor.layout()); + int64_t nnz = matrix.nonZeros(); + int64_t csize = matrix.outerSize(); + sparse_indices_and_values_resize(tensor, nnz); + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor); + if (nnz > 0) { + std::memcpy( + tensor.values().mutable_data_ptr(), + matrix.valuePtr(), + nnz * sizeof(scalar_t)); + std::memcpy( + plain_indices.mutable_data_ptr(), + matrix.innerIndexPtr(), + nnz * sizeof(index_t)); + } + if (csize > 0) { + std::memcpy( + compressed_indices.mutable_data_ptr(), + matrix.outerIndexPtr(), + csize * sizeof(index_t)); + } + compressed_indices.mutable_data_ptr()[csize] = nnz; +} + +template +void add_out_sparse_eigen( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result) { + // empty matrices + if (mat1._nnz() == 0 && mat2._nnz() == 0) { + return; + } + + if (mat2._nnz() == 0 || alpha.toComplexDouble() == 0.) 
{ + sparse_indices_and_values_resize(result, mat1._nnz()); + result.copy_(mat1); + return; + } else if (mat1._nnz() == 0) { + sparse_indices_and_values_resize(result, mat2._nnz()); + result.copy_(mat2); + result.values().mul_(alpha); + return; + } + + c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result); + + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1); + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2); + + AT_DISPATCH_INDEX_TYPES( + result_index_dtype, "eigen_sparse_add", [&]() { + scalar_t _alpha = alpha.to(); + + if (result.layout() == kSparseCsr) { + auto mat1_eigen = Tensor_to_Eigen(mat1); + auto mat2_eigen = Tensor_to_Eigen(mat2); + auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen); + Eigen_to_Tensor(result, mat1_mat2_eigen); + } else { + auto mat1_eigen = Tensor_to_Eigen(mat1); + auto mat2_eigen = Tensor_to_Eigen(mat2); + auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen); + Eigen_to_Tensor(result, mat1_mat2_eigen); + } + }); +} + +template +void addmm_out_sparse_eigen( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta) { + // empty matrices + if (mat1._nnz() == 0 || mat2._nnz() == 0) { + return; + } + + // If beta is zero NaN and Inf should not be propagated to the result + // In addition, beta = 0 lets us enable a fast-path for result = alpha * A @ B + bool is_beta_zero = false; + if (beta.toComplexDouble() == 0.) { + is_beta_zero = true; + result.values().zero_(); + } else { + result.values().mul_(beta); + } + + c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result); + + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1); + sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2); + + AT_DISPATCH_INDEX_TYPES( + result_index_dtype, "eigen_sparse_mm", [&]() { + typedef Eigen::SparseMatrix EigenCsrMatrix; + typedef Eigen::SparseMatrix EigenCscMatrix; + + at::Tensor mat1_mat2; + if (is_beta_zero) { + mat1_mat2 = result; + } else { + mat1_mat2 = at::empty_like(result, result.options()); + } + + if (mat1_mat2.layout() == kSparseCsr) { + if (mat1.layout() == kSparseCsr) { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csr = M1_csr * M2_csr + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csr = M1_csr * M2_csc + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } else { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csr = M1_csc * M2_csr + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csr = M1_csc * M2_csc + // This multiplication will be computationally inefficient, as it will require + // additional conversion of the output matrix from CSC to CSR format. 
+ const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } + } else { + if (mat1.layout() == kSparseCsr) { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csc = M1_csr * M2_csr + // This multiplication will be computationally inefficient, as it will require + // additional conversion of the output matrix from CSR to CSC format. + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csc = M1_csr * M2_csc + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } else { + const auto mat1_eigen = Tensor_to_Eigen(mat1); + if (mat2.layout() == kSparseCsr) { + // Out_csc = M1_csc * M2_csr + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } else { + // Out_csc = M1_csc * M2_csc + const auto mat2_eigen = Tensor_to_Eigen(mat2); + const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen); + Eigen_to_Tensor(mat1_mat2, mat1_mat2_eigen); + } + } + } + + if (is_beta_zero) { + result.mul_(alpha.to()); + } else { + result.add_(mat1_mat2, alpha.to()); + } + }); +} + +} // anonymous namespace + +void addmm_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta) { + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat1.layout(), "eigen::addmm_out_sparse:mat1", [&]{}); + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat2.layout(), "eigen::addmm_out_sparse:mat2", [&]{}); + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(result.layout(), "eigen::addmm_out_sparse:result", [&]{}); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + result.scalar_type(), "addmm_out_sparse_eigen", [&] { + addmm_out_sparse_eigen(mat1, mat2, result, alpha, beta); + }); +} + +void add_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result) { + TORCH_CHECK( + (result.layout() == kSparseCsr && mat1.layout() == kSparseCsr && mat2.layout() == kSparseCsr) || + (result.layout() == kSparseCsc && mat1.layout() == kSparseCsc && mat2.layout() == kSparseCsc), + "eigen::add_out_sparse: expected the same layout for all operands but got ", + mat1.layout(), + " + ", + mat2.layout(), + " -> ", + result.layout()); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + result.scalar_type(), "add_out_sparse_eigen", [&] { + add_out_sparse_eigen(mat1, mat2, alpha, result); + }); +} + +} // namespace at::native::sparse::impl::eigen + +#else + +namespace at::native::sparse::impl::eigen { + +void addmm_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta) { + TORCH_CHECK( + false, + "eigen::addmm_out_sparse: Eigen was not enabled for ", + result.layout(), + " + ", + mat1.layout(), + " @ ", + mat2.layout()); +} + +void add_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result) { + TORCH_CHECK( + false, + "eigen::add_out_sparse: Eigen was not enabled for ", + mat1.layout(), + " + ", + mat2.layout(), + " -> ", + result.layout()); +} + +} // namespace 
at::native::sparse::impl::eigen + +#endif // AT_USE_EIGEN_SPARSE() diff --git a/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.h b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.h new file mode 100644 index 0000000000000..d8e8dc322bc37 --- /dev/null +++ b/aten/src/ATen/native/sparse/eigen/SparseBlasImpl.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +#if AT_USE_EIGEN_SPARSE() +#ifndef EIGEN_MPL2_ONLY +#define EIGEN_MPL2_ONLY +#endif + +#include + +namespace at::native::sparse::impl::eigen { + +void addmm_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& result, + const at::Scalar& alpha, + const at::Scalar& beta); + +void add_out_sparse( + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Scalar& alpha, + const at::Tensor& result); + +} // namespace at::native::sparse::impl::eigen + +#endif diff --git a/aten/src/ATen/native/sparse/mps/FlattenIndices.mm b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm new file mode 100644 index 0000000000000..41efa545cd2a8 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm @@ -0,0 +1,73 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { +namespace { + +using namespace mps; +using namespace at::sparse; + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + +Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) { + TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D"); + TORCH_CHECK(static_cast(indices.size(0)) == size.size(), + "flatten_indices: indices.size(0) must equal size.size()"); + + const int64_t sparse_dim = indices.size(0); + const int64_t nnz = indices.size(1); + + if (nnz == 0) { + return at::empty({0}, indices.options().dtype(kLong)); + } + + // Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j]) + std::vector row_muls(sparse_dim); + row_muls[sparse_dim - 1] = 1; + for (int64_t i = sparse_dim - 2; i >= 0; --i) { + row_muls[i] = row_muls[i + 1] * size[i + 1]; + } + + auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + mtl_setArgs(encoder, + indices, + row_muls, + flat_indices, + static_cast(sparse_dim), + indices.strides() + ); + + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + return flat_indices; +} + +} // namespace +REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps) +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm new file mode 100644 index 0000000000000..3e0ac4e35da1a --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm @@ -0,0 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +using namespace mps; +using namespace at::sparse; + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + 
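flatten_indices_mps above collapses each n-dimensional sparse coordinate into a single row-major key, key = sum over d of index[d] * mul[d] with mul[d] = prod_{j > d} size[j], which is what the MPS coalesce path later sorts and deduplicates. A plain CPU sketch of the same computation (standalone code, not the Metal kernel):

// CPU sketch of the row-major index flattening used before coalescing.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> flatten_indices_cpu(
    const std::vector<std::vector<int64_t>>& indices,  // [sparse_dim][nnz]
    const std::vector<int64_t>& size) {
  const int64_t sparse_dim = static_cast<int64_t>(size.size());
  const int64_t nnz = indices.empty() ? 0 : static_cast<int64_t>(indices[0].size());

  // Row-major multipliers: mul[d] = prod_{j > d} size[j]
  std::vector<int64_t> mul(sparse_dim, 1);
  for (int64_t d = sparse_dim - 2; d >= 0; --d) {
    mul[d] = mul[d + 1] * size[d + 1];
  }

  std::vector<int64_t> flat(nnz, 0);
  for (int64_t n = 0; n < nnz; ++n) {
    for (int64_t d = 0; d < sparse_dim; ++d) {
      flat[n] += indices[d][n] * mul[d];
    }
  }
  return flat;
}

int main() {
  // Two nonzeros of a 3x4 tensor at coordinates (0, 2) and (2, 1).
  auto flat = flatten_indices_cpu({{0, 2}, {2, 1}}, {3, 4});
  std::cout << flat[0] << " " << flat[1] << "\n";  // prints: 2 9
}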
+static Tensor compute_output_positions(const Tensor& is_unique) { + + int64_t nnz = is_unique.size(0); + if (nnz == 0) { + return at::empty({0}, TensorOptions().device(kMPS).dtype(kInt)); + } + + Tensor positions = at::empty({nnz}, TensorOptions().device(kMPS).dtype(kInt)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("compute_output_positions_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, is_unique, positions); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + return positions; +} + +static Tensor compute_output_positions_parallel(const Tensor& is_unique) { + + int64_t nnz = is_unique.size(0); + if (nnz == 0) { + return at::empty({0}, TensorOptions().device(kMPS).dtype(kInt)); + } + + // for small arrays, use simple kernel + // speed of the naive kernel drops off after 4096 nnz elements + if (nnz <= 4096) { + return compute_output_positions(is_unique); + } + auto stream = getCurrentMPSStream(); + Tensor positions = is_unique.to(kInt); + // Kogge-Stone parallel prefix sum + Tensor positions_cloned = positions.clone(); + + for (int64_t stride = 1; stride < nnz; stride *= 2) { + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("kogge_stone_step"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, positions, positions_cloned, stride); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + std::swap(positions, positions_cloned); + } + + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("shift_right_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, positions, positions_cloned); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + return positions_cloned; +} + +static std::pair mark_unique_and_count(const Tensor& flat_indices) { + + int64_t nnz = flat_indices.size(0); + if (nnz == 0) { + return {at::empty({0}, flat_indices.options().dtype(kBool)), 0}; + } + + Tensor is_unique = at::empty({nnz}, flat_indices.options().dtype(kBool)); + Tensor count_result = at::zeros({1}, flat_indices.options().dtype(kInt)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("mark_unique_positions_and_count_kernel"); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + mtl_setArgs(encoder, flat_indices, is_unique, count_result); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + int32_t num_unique = count_result.item(); + + return {is_unique, num_unique}; +} + +SparseTensor _coalesce_sparse_mps(const SparseTensor& self) { + int64_t nnz = self._nnz(); + TORCH_INTERNAL_ASSERT(!self.is_coalesced()); + if (nnz < 2) { + SparseTensor dst = self.clone(); + dst._coalesced_(true); + return dst; + } + + Tensor indices = self._indices(); + Tensor values = self._values(); + + Tensor flat_indices = flatten_indices(indices, self.sizes()); + Tensor sorted_order = flat_indices.argsort(); + Tensor flat_indices_sorted = flat_indices.index({sorted_order}); + values = values.index({sorted_order}); + indices = indices.index_select(1, sorted_order); + + auto unique_info = 
mark_unique_and_count(flat_indices_sorted); + Tensor is_unique = unique_info.first; + int32_t newNnz = unique_info.second; + + Tensor output_positions = compute_output_positions_parallel(is_unique); + + Tensor out_indices = at::empty({indices.size(0), newNnz}, indices.options()); + auto outValuesSize = values.sizes().vec(); + outValuesSize[0] = newNnz; + Tensor out_values = at::zeros(outValuesSize, values.options()); + + Tensor is_unique_local = is_unique; + int64_t sparse_dim = indices.size(0); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("coalesce_with_positions_kernel_" + scalarToMetalTypeString(values)); + auto encoder = stream->commandEncoder(); + [encoder setComputePipelineState:pipeline]; + + const uint32_t numThreads = static_cast(nnz); + const uint32_t valueSize = static_cast(values.numel() / nnz); + mtl_setArgs(encoder, + flat_indices_sorted, + indices, + values, + is_unique_local, + output_positions, + out_indices, + out_values, + numThreads, + valueSize, + sparse_dim, + newNnz); + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + + SparseTensor result = _sparse_coo_tensor_unsafe_symint(out_indices, out_values, self.sym_sizes())._coalesced_(true); + return result; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm new file mode 100644 index 0000000000000..07ee2e097b49e --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -0,0 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +using namespace at::sparse; + +Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha); + +Tensor& add_out_dense_sparse_mps( + Tensor& out, + const Tensor& dense, + const SparseTensor& sparse, + const Scalar& alpha) { + TORCH_CHECK(dense.is_mps(), "add: expected 'self' to be an MPS tensor, got ", dense.device()); + TORCH_CHECK(sparse.is_mps(), "add: expected 'other' to be an MPS tensor, got ", sparse.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be an MPS tensor, got ", out.device()); + TORCH_CHECK(dense.sizes().equals(sparse.sizes()), + "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse.sizes(), + " (FYI: dense-sparse addition does not currently support broadcasting)"); + + const int64_t nnz = sparse._nnz(); + if (nnz == 0) { + out.resize_as_(dense); + out.copy_(dense); + return out; + } + + auto commonDtype = at::result_type(dense, sparse); + TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + Tensor r; + const bool need_separate_buffer = out.is_same(dense) || (out.scalar_type() != commonDtype); + if (need_separate_buffer) { + r = at::empty(dense.sizes(), out.options().dtype(commonDtype)); + } else { + r = out; + r.resize_as_(dense); + } + + Tensor dense_buffer = dense.to(commonDtype); + if (!r.is_same(dense_buffer)) { + r.copy_(dense_buffer); + } + + Tensor indices = sparse._indices(); + Tensor values = sparse._values().to(commonDtype); + if (values.numel() == 0) { + if (!out.is_same(r)) { + 
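+      // r already holds the dense input (cast to commonDtype); materialize it
+      // into out when they are distinct buffers before returning early.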
out.resize_as_(dense); + out.copy_(r); + } + return out; + } + + const int64_t nDim = r.dim(); + const int64_t nDimI = sparse.sparse_dim(); + TORCH_CHECK(nDimI >= 0 && nDimI <= nDim, + "Invalid sparse_dim=", nDimI, " for dense tensor of dim ", nDim); + + Tensor indices1D = at::sparse::flatten_indices(indices, sparse.sizes()).contiguous(); + + int64_t view_rows = 1; + int64_t view_cols = 1; + for (int64_t i = 0; i < nDimI; i++) { + view_rows *= r.size(i); + } + for (int64_t i = nDimI; i < nDim; i++) { + view_cols *= r.size(i); + } + + if (view_cols == 1) { + Tensor r_flat = r.reshape({view_rows}); + Tensor values_1d = values.reshape({nnz}); + r_flat.index_add_(0, indices1D, values_1d, alpha); + } else { + Tensor r_view = r.view({view_rows, view_cols}); + Tensor values_2d = values.reshape({nnz, view_cols}); + r_view.index_add_(0, indices1D, values_2d, alpha); + } + + if (!out.is_same(r)) { + out.resize_as_(dense); + out.copy_(r); + } + return out; +} + + +SparseTensor& add_out_sparse_mps(const SparseTensor& self, + const SparseTensor& other, + const Scalar& alpha, + SparseTensor& out) { + TORCH_CHECK(other.is_sparse(), "add(sparse, dense) is not supported. Use add(dense, sparse) instead."); + TORCH_CHECK(self.is_mps(), "add: expected 'self' to be MPS, but got ", self.device()); + TORCH_CHECK(other.is_mps(), "add: expected 'other' to be MPS, but got ", other.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be MPS, but got ", out.device()); + if (!self.is_sparse()) { + return add_out_dense_sparse_mps(out, self, other, alpha); + } + auto commonDtype = at::result_type(self, other); + TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + TORCH_CHECK(self.sizes().equals(other.sizes()), + "add: expected 'self' and 'other' to have same size, but ", self.sizes(), " != ", other.sizes()); + + if (other._nnz() == 0) { + out.resize_as_(self); + Tensor vals = self._values(); + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, self._indices(), vals); + out._coalesced_(self.is_coalesced()); + return out; + } + + if (self._nnz() == 0) { + out.resize_as_(other); + Tensor vals = other._values(); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + vals = at::mul(vals, alpha); + } + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, other._indices(), vals); + out._coalesced_(other.is_coalesced()); + return out; + } + + TORCH_CHECK(is_same_density(self, other), + "add: expected 'self' and 'other' to have same density, but 'self' has ", + self.sparse_dim(), " sparse dimensions while 'other' has ", other.sparse_dim(), " sparse dimensions"); + + Tensor t_indices_ = self._indices(); + Tensor s_indices_ = other._indices(); + + Tensor t_values_ = self._values().to(commonDtype); + Tensor s_values_ = other._values().to(commonDtype); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + s_values_ = at::mul(s_values_, alpha); + } + + Tensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); + Tensor r_values_ = at::cat({t_values_, s_values_ }, 0); + + SparseTensor tmp = empty({0}, out.options().dtype(commonDtype)); + tmp.resize_as_(other); + alias_into_sparse(tmp, r_indices_, r_values_); + tmp = _coalesce_sparse_mps(tmp); + + out.resize_as_(other); + Tensor out_vals = tmp._values(); + if (out.scalar_type() != commonDtype) { + out_vals = out_vals.to(out.scalar_type()); + } + alias_into_sparse(out, 
tmp._indices(), out_vals); + out._coalesced_(tmp.is_coalesced()); + + return out; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal new file mode 100644 index 0000000000000..e32d1edf1c2f6 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal @@ -0,0 +1,117 @@ +#include +#include +using namespace metal; + + +kernel void compute_output_positions_kernel( + device const bool* is_unique [[buffer(0)]], + device int* positions [[buffer(1)]], + uint gid [[thread_position_in_grid]]) { + int pos = 0; + for (uint i = 0; i < gid; i++) { + if (is_unique[i]) + pos++; + } + positions[gid] = pos; +} + +kernel void mark_unique_positions_and_count_kernel( + device const int64_t* flat_indices [[buffer(0)]], + device bool* is_unique [[buffer(1)]], + device atomic_int* count [[buffer(2)]], + uint tid [[thread_position_in_grid]]) { + bool unique = (tid == 0) || (flat_indices[tid] != flat_indices[tid - 1]); + is_unique[tid] = unique; + + if (unique) { + atomic_fetch_add_explicit(count, 1, memory_order_relaxed); + } +} + +// Kogge-Stone parallel prefix sum step +kernel void kogge_stone_step( + device const int* input [[buffer(0)]], + device int* output [[buffer(1)]], + constant uint& stride [[buffer(2)]], + uint gid [[thread_position_in_grid]]) { + int val = input[gid]; + if (gid >= stride) { + val += input[gid - stride]; + } + output[gid] = val; +} + +// Shift right for exclusive scan +kernel void shift_right_kernel( + device const int* input [[buffer(0)]], + device int* output [[buffer(1)]], + uint gid [[thread_position_in_grid]]) { + output[gid] = (gid == 0) ? 0 : input[gid - 1]; +} + +template +kernel void coalesce_with_positions_kernel( + device const int64_t* flat_indices [[buffer(0)]], + device const int64_t* indices [[buffer(1)]], + device const T* in_values [[buffer(2)]], + device const bool* is_unique [[buffer(3)]], + device const int* output_positions [[buffer(4)]], + device int64_t* out_indices [[buffer(5)]], + device T* out_values [[buffer(6)]], + constant uint& nnz [[buffer(7)]], + constant uint& value_size [[buffer(8)]], + constant uint& sparse_dim [[buffer(9)]], + constant uint& total_unique [[buffer(10)]], + uint gid [[thread_position_in_grid]]) { + if (!is_unique[gid]) + return; + + int out_pos = output_positions[gid]; + + for (uint d = 0; d < sparse_dim; d++) { + out_indices[d * total_unique + out_pos] = indices[d * nnz + gid]; + } + + int64_t current_index = flat_indices[gid]; + uint end = gid + 1; + while (end < nnz && flat_indices[end] == current_index) { + end++; + } + + for (uint elem = 0; elem < value_size; elem++) { + T sum = 0; + for (uint j = gid; j < end; j++) { + sum += in_values[j * value_size + elem]; + } + out_values[out_pos * value_size + elem] = sum; + } +} + +#define INSTANTIATE_COALESCE_WITH_POSITIONS(DTYPE) \ + template \ + [[host_name("coalesce_with_positions_kernel_" #DTYPE)]] [[kernel]] void \ + coalesce_with_positions_kernel( \ + device const int64_t* flat_indices [[buffer(0)]], \ + device const int64_t* indices [[buffer(1)]], \ + device const DTYPE* in_values [[buffer(2)]], \ + device const bool* is_unique [[buffer(3)]], \ + device const int* output_positions [[buffer(4)]], \ + device int64_t* out_indices [[buffer(5)]], \ + device DTYPE* out_values [[buffer(6)]], \ + constant uint& nnz [[buffer(7)]], \ + constant uint& value_size [[buffer(8)]], \ + constant uint& sparse_dim [[buffer(9)]], \ + constant uint& 
total_unique [[buffer(10)]], \ + uint gid [[thread_position_in_grid]]); + +INSTANTIATE_COALESCE_WITH_POSITIONS(float); +INSTANTIATE_COALESCE_WITH_POSITIONS(half); +INSTANTIATE_COALESCE_WITH_POSITIONS(bfloat); +INSTANTIATE_COALESCE_WITH_POSITIONS(bool); +INSTANTIATE_COALESCE_WITH_POSITIONS(long); +INSTANTIATE_COALESCE_WITH_POSITIONS(char); +INSTANTIATE_COALESCE_WITH_POSITIONS(uchar); +INSTANTIATE_COALESCE_WITH_POSITIONS(short); +INSTANTIATE_COALESCE_WITH_POSITIONS(int); +INSTANTIATE_COALESCE_WITH_POSITIONS(float2); +INSTANTIATE_COALESCE_WITH_POSITIONS(half2); \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal new file mode 100644 index 0000000000000..00156dddb06c2 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal @@ -0,0 +1,19 @@ +#include +using namespace metal; + + +kernel void flatten_indices_kernel( + device const long* indices [[ buffer(0) ]], + device const long* row_muls [[ buffer(1) ]], + device long* flat_indices [[ buffer(2) ]], + constant uint& sparse_dim [[ buffer(3) ]], + constant long2& idx_strides [[ buffer(4) ]], + uint gid [[ thread_position_in_grid ]]) { + long flat = 0; + for (uint d = 0; d < sparse_dim; ++d) { + long off = (long)d * idx_strides.x + (long)gid * idx_strides.y; + long v = indices[off]; + flat += v * row_muls[d]; + } + flat_indices[gid] = flat; +} \ No newline at end of file diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 80049aa9a832f..b8b43e0086c1a 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -849,16 +849,6 @@ std::tuple _efficient_ if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) std::optional out(res); std::optional seqused_k = std::nullopt; std::optional alibi_slopes = std::nullopt; @@ -1406,12 +1396,15 @@ std::tuple _efficient_ at::Tensor v_t = value.transpose(1, 2); at::Tensor output_t = res.transpose(1, 2); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor atomic_counter; @@ -1436,7 +1429,51 @@ std::tuple _efficient_ auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? 
atomic_counter.data_ptr() : nullptr); hipError_t err; // TODO: Error handling - if (seqstart_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_fwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.Sm_scale = softmax_scale; + params.L = compute_logsumexp ? mk_aotensor<2>(softmax_lse, "M") : empty_t2; + params.Out = mk_aotensor(output_t, "Out"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = dropout_p; + params.philox_seed_ptr = seed; + params.philox_offset1 = offset1; + params.philox_offset2 = offset2; + params.philox_seed_output = seed_output; + params.philox_offset_output = offset_output; + params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); + params.persistent_atomic_counter = persistent_counter; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } + if (bias.has_value()) { + params.B = mk_aotensor(bias.value(), "bias"); + } + if (seqstart_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(seqstart_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(seqstart_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_fwd(params, + aotriton::v3::flash::attn_fwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (seqstart_q.has_value()) { // varlen aka nested tensor err = attn_fwd_compact_varlen(mk_aotensor(q_t, "q"), mk_aotensor(k_t, "k"), diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 3888df64ad80b..55fc1e261219e 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -24,8 +24,11 @@ #include #include #else +#include #include #include +#include +#include #include #include #include @@ -45,6 +48,7 @@ #include #include #else +#include // MemoryEfficient Attention Specific Imports for ROCM #ifndef DISABLE_AOTRITON #include @@ -184,7 +188,7 @@ std::tuple _flash_attention_backward( return std::make_tuple(Tensor(), Tensor(), Tensor()); } -std::tuple _scaled_dot_product_cudnn_attention_backward_cuda( +std::tuple _cudnn_attention_backward( const Tensor& grad_out, const Tensor& query, const Tensor& key, @@ -211,57 +215,117 @@ std::tuple _scaled_dot_product_cudnn_attention_backward_ } } - const int64_t batch_size = query.size(0); - const int64_t num_heads = query.size(1); - const int64_t head_dim_qk = query.size(3); - const int64_t head_dim_v = value.size(3); + const bool is_nested = cum_seq_q.defined(); const int64_t max_seqlen_batch_q = query.size(2); const 
int64_t max_seqlen_batch_k = key.size(2); - // This is needed because SaveVariable automatically converts - // std::optional to undefined tensor - std::optional attn_bias_; - if (attn_bias.defined()) { - attn_bias_ = attn_bias; - } - if (attn_bias_.has_value()) { - const auto bias_dim = attn_bias_.value().dim(); - if (bias_dim == 2) { - attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); - } else if (bias_dim == 3) { - attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); - } else { - TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D"); - attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k}); + if (!is_nested) { + const int64_t batch_size = query.size(0); + const int64_t num_heads = query.size(1); + const int64_t head_dim_qk = query.size(3); + const int64_t head_dim_v = value.size(3); + + // This is needed because SaveVariable automatically converts + // std::optional to undefined tensor + std::optional attn_bias_; + if (attn_bias.defined()) { + attn_bias_ = attn_bias; + } + if (attn_bias_.has_value()) { + const auto bias_dim = attn_bias_.value().dim(); + if (bias_dim == 2) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else if (bias_dim == 3) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else { + TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D"); + attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k}); + } } - } - const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); - auto dq = at::empty_like(query); - auto dk = at::empty_like(key); - auto dv = at::empty_like(value); - run_cudnn_SDP_bprop(batch_size /*int64_t b*/, - num_heads /*int64_t h*/, - max_q/*int64_t s_q*/, - max_k/*int64_t s_kv*/, - head_dim_qk /*int64_t d_qk*/, - head_dim_v /*int64_t d_v*/, - softmax_scale /*float scaling_factor*/, - is_causal /*bool is_causal*/, - dropout_p /*float dropout_probability*/, - query /*const Tensor& q*/, - key /*const Tensor& k*/, - value /*const Tensor& v*/, - attn_bias_ /*const std::optional& attn_bias*/, - out /*const Tensor& o*/, - grad_out/*const Tensor& dO*/, - logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/, - dq/*Tensor& dQ*/, - dk/*Tensor& dK*/, - dv/*Tensor& dV*/, - philox_seed/*Tensor& dropoutseed*/, - philox_offset/*Tensor& dropoutoffset*/); - return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); + const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); + auto dq = at::empty_like(query); + auto dk = at::empty_like(key); + auto dv = at::empty_like(value); + run_cudnn_SDP_bprop(batch_size /*int64_t b*/, + num_heads /*int64_t h*/, + max_q/*int64_t s_q*/, + max_k/*int64_t s_kv*/, + head_dim_qk /*int64_t d_qk*/, + head_dim_v /*int64_t d_v*/, + softmax_scale /*float scaling_factor*/, + is_causal /*bool is_causal*/, + dropout_p /*float dropout_probability*/, + query /*const Tensor& q*/, + key /*const Tensor& k*/, + value /*const Tensor& v*/, + attn_bias_ /*const std::optional& attn_bias*/, + out /*const Tensor& o*/, + grad_out/*const Tensor& dO*/, + logsumexp/*const Tensor& softmaxstats*/, + dq/*Tensor& dQ*/, + dk/*Tensor& dK*/, + dv/*Tensor& dV*/, + 
philox_seed/*Tensor& dropoutseed*/, + philox_offset/*Tensor& dropoutoffset*/); + return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); + } else { + // BHSD ... + const int64_t batch_size = cum_seq_q.size(0) - 1; + const int64_t num_heads_q = query.size(-2); + const int64_t num_heads_k = key.size(-2); + const int64_t num_heads_v = value.size(-2); + const int64_t head_dim_qk = query.size(-1); + const int64_t head_dim_v = value.size(-1); + std::optional attn_bias_; + if (attn_bias.defined()) { + attn_bias_ = attn_bias; + } + if (attn_bias_.has_value()) { + const auto bias_dim = attn_bias_.value().dim(); + if (bias_dim == 2) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else if (bias_dim == 3) { + attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k}); + } else { + attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k}); + TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D"); + } + } + + auto dq = at::empty_like(query); + auto dk = at::empty_like(key); + auto dv = at::empty_like(value); + + const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked(); + run_cudnn_SDP_bprop_nestedtensor( + batch_size, + num_heads_q, + num_heads_k, + num_heads_v, + max_seqlen_batch_q, + max_seqlen_batch_k, + head_dim_qk, + head_dim_v, + softmax_scale, + is_causal, + dropout_p, + cum_seq_q, + cum_seq_k, + query, + key, + value, + attn_bias_, + out, + grad_out, + logsumexp, + dq, + dk, + dv, + philox_seed, + philox_offset); + return std::make_tuple(std::move(dq), std::move(dk), std::move(dv)); + } } std::tuple @@ -431,7 +495,7 @@ _efficient_attention_backward( // ROCM Implementation if(at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) const auto my_softmax_scale = sdp::calculate_scale(query, scale).expect_float(); // Store grad_bias in optional std::optional opt_grad_bias = grad_bias; @@ -482,12 +546,15 @@ _efficient_attention_backward( } const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_backward] Unsupported mask type in AOTriton, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor q_t = query.permute({0,2,1,3}); at::Tensor k_t = key.permute({0,2,1,3}); @@ -506,7 +573,62 @@ _efficient_attention_backward( using sdp::aotriton_adapter::mk_aoscalartensor; using sdp::aotriton_adapter::cast_dtype; aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype())); - if (cu_seqlens_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using 
aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_bwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.B = bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4; + params.Sm_scale = softmax_scale; + params.Out = mk_aotensor(out_t, "out"); + params.DO = mk_aotensor(dout_t, "dout"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DB = bias_requires_grad ? mk_aotensor(grad_bias, "db") : empty_t4; + params.L = mk_aotensor<2>(softmax_lse, "L"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = float(dropout_p); + params.philox_seed_ptr = mk_aoscalartensor(philox_seed); + params.philox_offset1 = mk_aoscalartensor(philox_offset); + params.philox_offset2 = 0; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif + if (cu_seqlens_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_bwd(params, + aotriton::v3::flash::attn_bwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (cu_seqlens_q.has_value()) { at::Tensor delta = at::empty_like(softmax_lse).contiguous(); // varlen aka Nested tensor err = attn_bwd_compact_varlen(mk_aotensor(q_t, "q"), @@ -1063,4 +1185,40 @@ std::tuple _scaled_dot_product_e } } +std::tuple _scaled_dot_product_cudnn_attention_backward_cuda( + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + const Tensor& philox_seed, + const Tensor& philox_offset, + const Tensor& attn_bias, + const Tensor& cum_seq_q, + const Tensor& cum_seq_k, + const int64_t max_q, + const int64_t max_k, + double dropout_p, + bool is_causal, + std::optional scale) { + return at::_cudnn_attention_backward( + grad_out, + query, + key, + value, + out, + logsumexp, + philox_seed, + philox_offset, + attn_bias, + cum_seq_q, + cum_seq_k, + max_q, + max_k, + dropout_p, + is_causal, + scale); +} + } // namespace at::native diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 4b198f4d6d2de..c826ef1ab8b15 100644 
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #if AT_CUDNN_ENABLED() #include @@ -25,9 +26,12 @@ #if USE_ROCM #if defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION) +#include #include #define USE_ROCM_ATTENTION 1 #endif +#else +#define USE_ROCM_ATTENTION 0 #endif // Avoid potential compiler -Wall -Werror complains undefined macro @@ -57,21 +61,29 @@ namespace sdp { namespace { +// tracks whether we've set the default priority order once, to avoid setting +// it redundantly or overwriting a user-specified priority order +// when the priority order context manager is used before the default priority +// order is initialized the following happens: +// (1) the current priority order is queried +// (2) priority_order() is called, which initializes it to the default as init_ is false +// (3) the user-specified priority order is set +// (3.1) we are in the priority context... +// (3.2) we exit the priority context... +// (4) the previous priority order (default) is restored +bool priority_order_init_ = false; + // TODO(eqy): more benchmarking to determine whether this should include sm86/89 // Needs to be kept in-sync with test_fused_chocie in test_transformers.py bool check_prefer_cudnn_attention() { - // TODO(eqy): Re-enable by default after upgrading to a release later than 9.5.0 - // see context: https://github.com/pytorch/pytorch/issues/138340 - // return false; -#if defined(CUDNN_VERSION) - -#if CUDNN_VERSION > 90000 + static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") != false; + if (!prefer_cudnn) { + return false; + } +#if (defined(CUDNN_VERSION) && (CUDNN_VERSION >= 90900)) auto dprops = at::cuda::getCurrentDeviceProperties(); - return dprops->major >= 9; -#else - return false; -#endif - + auto major = dprops->major; + return (major == 9 || major == 10) && !dprops->minor; #else return false; #endif @@ -79,6 +91,16 @@ bool check_prefer_cudnn_attention() { // flash_attention V2 is universally faster than efficient_attention and Math std::array priority_order(sdp_params const& params) { + if (!priority_order_init_) { + priority_order_init_ = true; + if (check_prefer_cudnn_attention()) { + const std::vector cudnn_order = {static_cast(at::SDPBackend::cudnn_attention), + static_cast(at::SDPBackend::flash_attention), + static_cast(at::SDPBackend::efficient_attention), + static_cast(at::SDPBackend::math)}; + at::globalContext().setSDPPriorityOrder(cudnn_order); + } + } return at::globalContext().sDPPriorityOrder(); } @@ -112,9 +134,24 @@ int64_t minimum_gemm_alignment(sdp_params const& params) { // caller_is_meff is added to make the TORCH_WARN message showing the correct result template bool check_head_dim_size_flash(sdp_params const& params, bool debug) { -#if USE_ROCM_ATTENTION && AOTRITON_VERSION_MINOR >= 9 +#if USE_ROCM_ATTENTION // AOTriton 0.9+ supports head_dim up to 512 - const auto max_size = c10::SymInt(512); + const static auto max_hdim = []() { +#if AOTRITON_VERSION_CURRENT == AOTRITON_VERSION_INT(0, 11) + // gfx11xx only support hdim <= 256 on AOTriton 0.11 + auto dprops = at::cuda::getCurrentDeviceProperties(); + const c10::basic_string_view arch(dprops->gcnArchName); + if (arch.starts_with("gfx11")) { + return 256; + } +#endif // AOTriton 0.11 +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 9) + return 512; +#else + return 256; +#endif + }(); + const auto max_size = c10::SymInt(max_hdim); #else // 
All head_dim sizes must be equal and less than 256 const auto max_size = c10::SymInt(256); @@ -414,9 +451,9 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { return false; } auto head_dim_limit = 128; - if (cudnn_version >= 90501) { + if (cudnn_version >= 91000) { auto dprops = at::cuda::getCurrentDeviceProperties(); - if (dprops->major == 9 && !dprops->minor) { + if (dprops->major == 9 && !dprops->minor) { head_dim_limit = 256; } } @@ -453,9 +490,15 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) { return false; } } - if (s_q == 1 || s_k == 1) { + if (s_k == 1) { + if (debug) { + TORCH_WARN_ONCE("cudnn SDPA does not support key/value sequence length 1."); + } + return false; + } + if (s_q == 1 && params.dropout != 0.0) { if (debug) { - TORCH_WARN_ONCE("cudnn SDPA does not support sequence length 1."); + TORCH_WARN_ONCE("cudnn SDPA does not support query sequence length 1 with dropout."); } return false; } @@ -563,9 +606,9 @@ bool check_for_nested_inputs(sdp_params const& params, bool debug) { const auto dprop = at::cuda::getCurrentDeviceProperties(); // Check that the input is nested - if (dprop->major != 9 && has_for_nested_inputs(params)) { + if (!(dprop->major == 9 || dprop->major == 10) && has_for_nested_inputs(params)) { if (debug) { - TORCH_WARN("CuDNN SDPA supports nested tensors on SM 9.0."); + TORCH_WARN("cuDNN SDPA supports nested tensors on SM 9.0, SM 10.0."); } return false; } @@ -589,7 +632,7 @@ bool check_runtime_disabled_cudnn(sdp_params const& params, bool debug) { // sdp kernels if (!at::globalContext().userEnabledCuDNNSDP()) { if (debug) { - TORCH_WARN("CuDNN attention has been runtime disabled."); + TORCH_WARN("cuDNN attention has been runtime disabled."); } return false; } @@ -620,7 +663,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { #endif #if defined(CUDNN_VERSION) && CUDNN_VERSION < 90000 if (debug) { - TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use CuDNN Attention (< v9.0.0)"); + TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use cuDNN Attention (< v9.0.0)"); } return false; #endif @@ -630,10 +673,8 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { c10::array_of( check_runtime_disabled_cudnn, check_for_nested_inputs, - check_nonzero_sequence_lengths_dense, check_all_tensors_on_device, check_tensor_shapes, - check_cudnn_tensor_shapes, check_cudnn_deterministic, check_dtypes_low_precision, check_attn_mask_shape, @@ -646,8 +687,10 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { } constexpr auto dense_constraints = c10::array_of( + check_nonzero_sequence_lengths_dense, check_last_dim_stride_equals_1_dense, - check_batch_size_and_num_heads_dense + check_batch_size_and_num_heads_dense, + check_cudnn_tensor_shapes ); if (has_only_dense_inputs(params)) { @@ -864,7 +907,7 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) { sdp::can_use_mem_efficient_attention(kernel_params, print_debug); TORCH_WARN("Flash attention kernel not used because:"); sdp::can_use_flash_attention(kernel_params, print_debug); - TORCH_WARN("CuDNN attention kernel not used because:"); + TORCH_WARN("cuDNN attention kernel not used because:"); sdp::can_use_cudnn_attention(kernel_params, print_debug); TORCH_CHECK(!print_debug, "No available kernel. 
Aborting execution.") return SDPBackend::error; diff --git a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h index aedb205e57101..d316808cf9bef 100644 --- a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h +++ b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h @@ -2,8 +2,12 @@ #ifdef USE_ROCM +// Expect to be included after headers of at::zeros_like and at::empty_like + #include #include +#include +#include //////////////////////////////////////////////////////////////////////////////// // Common macros copied from cuda/mem_eff_attention/gemm_kernel_utils.h @@ -111,6 +115,61 @@ inline aotriton::TensorView<0> mk_atomictensor(const int32_t* ptr) aotriton::DType::kInt32); } +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) + +struct LazyTensorContext { + at::Tensor like_tensor; + std::string_view tensor_name; + at::Tensor tensor; +}; + +template +struct LazyTensorFunctions : public LazyTensorContext { + static aotriton::TensorView acquire(void* cookie) { + auto ctx = (LazyTensorContext*)cookie; + if (!ctx->tensor.defined()) { + auto q = ctx->like_tensor; + if constexpr (kRequireZeros) { + ctx->tensor = at::zeros(q.sizes(), + q.options().dtype(at::kFloat)); + } else { + ctx->tensor = at::empty_like(q); + } + } + return mk_aotensor(ctx->tensor, ctx->tensor_name); + } + + static void dispose(void* cookie) { + } +}; + +template +aotriton::LazyTensor mklazy_common(LazyTensorContext* cookie) +{ + using LTF = LazyTensorFunctions; + return aotriton::LazyTensor { + .cookie = cookie, + .acquire = <F::acquire, + .dispose = <F::dispose + }; +} + +template +auto mklazy_empty_like(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + + +// Note: this will not keep the original strides +template +auto mklazy_fp32zeros(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + +#endif // >= 0.11 + } // namespace aotriton_adapter } // namespace sdp diff --git a/aten/src/ATen/native/transformers/hip/aotriton_versions.h b/aten/src/ATen/native/transformers/hip/aotriton_versions.h new file mode 100644 index 0000000000000..2f5d3f0e12228 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/aotriton_versions.h @@ -0,0 +1,20 @@ +#pragma once + +#ifdef USE_ROCM + +#define AOTRITON_VERSION_INT(x, y) (x * 100 + y) +#define AOTRITON_VERSION_CURRENT (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) +#define AOTRITON_ALWAYS_V3_API 1 +#else +#define AOTRITON_ALWAYS_V3_API 0 +#endif + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 10) +#define AOTRITON_V3_API 1 +#else +#define AOTRITON_V3_API 0 +#endif + +#endif diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip index 05523f75caa42..b5b1ed4292896 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip @@ -60,20 +60,13 @@ #include // AOTriton headers -#include #include #include -#if AOTRITON_VERSION_MINOR < 9 +#if AOTRITON_VERSION_CURRENT < AOTRITON_VERSION_INT(0, 9) #error "This adaptor code is only tested with AOTriton >= 0.9" #endif -#if (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) >= 10 -#define V3_API 1 -#else -#define V3_API 0 -#endif - namespace pytorch_flash { namespace { @@ -93,15 +86,15 @@ calculate_swa(std::optional window_size_left, int max_seqlen_q, int 
max_seqlen_k, bool is_causal) { -#if V3_API // SWA is exposed through V3 API +#if AOTRITON_V3_API // SWA is exposed through V3 API bool needs_swa = false; using aotriton::v3::flash::WindowValue; // Default values when std::optional window_size_left/right have no value int window_left = max_seqlen_q; int window_right = max_seqlen_k; if (is_causal) { - window_left = WindowValue::TopLeftAligned; - window_right = WindowValue::TopLeftAligned; + window_left = WindowValue::BottomRightAligned; + window_right = WindowValue::BottomRightAligned; } if (window_size_left.has_value() || window_size_right.has_value()) { needs_swa = true; @@ -243,25 +236,27 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x } else { softmax_fa_t = at::empty({ 0, 0, 0, 0 }, opts); } - - at::Tensor atomic_counter; - if (is_causal) { - atomic_counter = at::zeros({1}, opts.dtype(at::kInt)); - } - auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left, window_size_right, seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif + // SWA in AOTriton Kernels is treated as "Generalized Causal masks" + is_causal = is_causal || uses_swa; + + at::Tensor atomic_counter; + if (is_causal) { + atomic_counter = at::zeros({1}, opts.dtype(at::kInt)); + } + hipError_t err; // TODO: Error handling using aotriton::v2::flash::attn_fwd; using sdp::aotriton_adapter::mk_aotensor; @@ -276,8 +271,8 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x auto seed_output = mk_philoxtensor(use_philox_state ? seed_t.data_ptr() : nullptr); auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr() : nullptr); - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -297,7 +292,7 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::None; params.window_left = window_left; params.window_right = window_right; @@ -447,14 +442,17 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). 
constexpr bool uses_swa = false; #endif + // SWA in AOTriton Kernels is treated as "Generalized Causal masks" + is_causal = is_causal || needs_swa; + auto [seed_t, offset_t, philox_state, use_philox_state] = prepare_philox_arguments(p_dropout, batch_size * num_heads * 32); @@ -477,8 +475,8 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr()) : nullscalar; auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr()) : nullscalar; auto persistent_counter = is_causal ? mk_philoxtensor(atomic_counter.data_ptr()) : nullscalar; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -500,7 +498,7 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; @@ -594,10 +592,6 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); - if (is_causal){ - TORCH_CHECK((seqlen_q == seqlen_k), "For backwards kernel seqlen_q must equal seqlen_k for causal kernels"); - } - TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!"); @@ -649,10 +643,10 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). 
constexpr bool uses_swa = false; #endif @@ -676,10 +670,9 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API // Fused BWD does not support SWA - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -689,21 +682,32 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_t, "dq"); - params.DV = mk_aotensor(dk_t, "dk"); - params.DQ = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.Max_seqlen_q = seqlen_q; // Unused if cu_seqlens_q is empty params.Max_seqlen_k = seqlen_k; // Unused if cu_seqlens_k is empty params.dropout_p = p_dropout; params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; - params.varlen_type = VarlenType::None; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.window_left = window_left; params.window_right = window_right; + params.varlen_type = VarlenType::None; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); @@ -838,7 +842,6 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_SHAPE(cu_seqlens_k, batch_size + 1); at::Tensor softmax_lse_cont = softmax_lse.view({batch_size * num_heads, max_seqlen_q}).contiguous(); - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); at::Tensor q_padded, k_padded, v_padded; q_padded = q.unsqueeze(0).transpose(1, 2); @@ -896,10 +899,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). 
constexpr bool uses_swa = false; #endif @@ -919,8 +922,8 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -930,11 +933,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_padded, "dq"); - params.DV = mk_aotensor(dk_padded, "dk"); - params.DQ = mk_aotensor(dv_padded, "dv"); + params.DK = mk_aotensor(dk_padded, "dk"); + params.DV = mk_aotensor(dv_padded, "dv"); + params.DQ = mk_aotensor(dq_padded, "dq"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q, "cu_seqlens_q"); params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k, "cu_seqlens_k"); params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty @@ -943,17 +945,30 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_padded, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); -#endif +#endif // AOTRITON_ALWAYS_V3_API } else { using aotriton::v2::flash::attn_bwd_compact_varlen; using sdp::aotriton_adapter::cast_dtype; + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype())); err = attn_bwd_compact_varlen(mk_aotensor(q_padded, "q"), mk_aotensor(k_padded, "k"), diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip index 601ffd2d07525..59669afb93d2f 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_bwd_ck.hip @@ -1,7 +1,7 @@ #include #include -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) namespace pytorch_flash { std::tuple< at::Tensor, // dQ @@ -117,4 +117,4 @@ mem_eff_backward_ck( } } // namespace pytorch_flash -#endif // USE_CK_FLASH_ATTENTION +#endif // USE_ROCM_CK_SDPA diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h 
b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h index 6fd46467bc076..e92006ef6315c 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -3,7 +3,7 @@ #include -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) namespace pytorch_flash { std::tuple< @@ -64,4 +64,4 @@ mem_eff_backward_ck( const at::Tensor philox_offset); } // namespace pytorch_flash -#endif // USE_CK_FLASH_ATTENTION +#endif // USE_ROCM_CK_SDPA diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip index fac77821a56c1..d15c5105d0b46 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/me_fwd_ck.hip @@ -1,7 +1,7 @@ #include #include -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) namespace pytorch_flash { std::tuple< at::Tensor, // output @@ -93,4 +93,4 @@ mem_eff_forward_ck( } } // namespace pytorch_flash -#endif // USE_CK_FLASH_ATTENTION +#endif // USE_ROCM_CK_SDPA diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h index 17298aae9485d..f6f2240d4f091 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -147,7 +147,7 @@ std::tuple mha_varlen_bwd_aot( const at::Tensor& philox_seed, const at::Tensor& philox_offset); -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) // CK implementation TORCH_API std::tuple< @@ -295,7 +295,7 @@ mha_fwd( const float softcap, const bool return_softmax, std::optional gen_) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { const int non_null_window_left = window_size_left.value_or(-1); @@ -368,7 +368,7 @@ mha_varlen_fwd( const float softcap, const bool return_softmax, std::optional gen_) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::optional dummy_attn_bias = std::nullopt; @@ -441,9 +441,10 @@ inline std::tuple mha_bwd( const bool deterministic, const at::Tensor philox_seed, const at::Tensor philox_offset) { + +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { -#if defined(USE_CK_FLASH_ATTENTION) std::optional non_null_dbias = std::nullopt; const int non_null_window_left = window_size_left.value_or(-1); const int non_null_window_right = window_size_right.value_or(-1); @@ -474,10 +475,8 @@ inline std::tuple mha_bwd( philox_offset); // for FA return [dQ, dV, dK, dSoftmax] return std::make_tuple(std::move(dQuery), std::move(dKey), std::move(dValue), std::move(dSoftmax)); -#else - TORCH_WARN_ONCE("Warning! You have opted to use CK flash attention backend in a build that was not compiled using USE_CK_FLASH_ATTENTION=1. Please set this variable and try again. 
Defaulting to use aotriton backend..."); -#endif } +#endif return mha_bwd_aot( dout, q, @@ -530,7 +529,7 @@ inline std::tuple mha_varlen_bwd const bool deterministic, const at::Tensor philox_seed, const at::Tensor philox_offset) { -#if defined(USE_CK_FLASH_ATTENTION) +#if defined(USE_ROCM_CK_SDPA) if (at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck) { std::optional non_null_dbias = std::nullopt; diff --git a/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h new file mode 100644 index 0000000000000..c18744afc1ffc --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file is a trimmed version of cuda/mem_eff_attention/gemm_kernel_utils.h +#pragma once + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index 236753c94d37b..1b3ed4dc4ac42 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -10,8 +10,13 @@ using namespace at::native::memory; constexpr int buffer_size = 1024; +#if defined(CUDA_VERSION) && CUDA_VERSION < 13000 __managed__ double4 buffer1[buffer_size]; __managed__ double4 buffer2[buffer_size]; +#else +__managed__ double4_16a buffer1[buffer_size]; +__managed__ double4_16a buffer2[buffer_size]; +#endif void reset_buffers() { for (int i = 0; i < buffer_size; i++) { diff --git a/aten/src/ATen/test/thread_init_test.cpp b/aten/src/ATen/test/thread_init_test.cpp index 7ad7a18e9c660..60dd52d1dffcb 100644 --- a/aten/src/ATen/test/thread_init_test.cpp +++ b/aten/src/ATen/test/thread_init_test.cpp @@ -1,7 +1,8 @@ +#include + #include #include #include -#include #include @@ -9,7 +10,7 @@ // numbers of threads set and also whether the scheduler // will throw an exception when multiple threads call // their first parallel construct. 
-void test(int given_num_threads) { +static void test(int given_num_threads) { auto t = at::ones({1000 * 1000}, at::CPU(at::kFloat)); ASSERT_TRUE(given_num_threads >= 0); ASSERT_EQ(at::get_num_threads(), given_num_threads); @@ -19,7 +20,7 @@ void test(int given_num_threads) { } } -int main() { +TEST(ThreadInitTest, ThreadInit) { at::init_num_threads(); at::set_num_threads(4); @@ -32,13 +33,11 @@ int main() { #if !AT_PARALLEL_NATIVE at::set_num_threads(5); - ASSERT_TRUE(at::get_num_threads() == 5); + ASSERT_EQ(at::get_num_threads(), 5); #endif // test inter-op settings at::set_num_interop_threads(5); ASSERT_EQ(at::get_num_interop_threads(), 5); ASSERT_ANY_THROW(at::set_num_interop_threads(6)); - - return 0; } diff --git a/aten/src/ATen/xpu/CachingHostAllocator.cpp b/aten/src/ATen/xpu/CachingHostAllocator.cpp index 1255285d25af0..d531b46c3c554 100644 --- a/aten/src/ATen/xpu/CachingHostAllocator.cpp +++ b/aten/src/ATen/xpu/CachingHostAllocator.cpp @@ -30,6 +30,12 @@ struct XPUCachingHostAllocatorImpl bool query_event(XPUEvent& event) override { return event.query(); } + + bool pinned_use_background_threads() override { + // Using background threads for XPU causes a hang on Windows during program + // exit. Will be enabled once the issue is resolved. + return false; + } }; DECLARE_HOST_ALLOCATOR( diff --git a/benchmarks/data/dataloader_benchmark.py b/benchmarks/data/dataloader_benchmark.py new file mode 100644 index 0000000000000..7d1dd3afc7e98 --- /dev/null +++ b/benchmarks/data/dataloader_benchmark.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Benchmark script for PyTorch DataLoader with different worker methods. + +This script measures: +1. Dataloader initialization time +2. Dataloading speed (time per batch) +3. CPU memory utilization + +Usage: + python dataloader_benchmark.py --data_path /path/to/dataset --batch_size 32 --num_workers 4 +""" + +import argparse +import copy +import gc +import time + +import psutil +import torchvision +import torchvision.transforms as transforms +from torchvision.models import resnet18 + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader +from torch.utils.data.dataset import ConcatDataset + + +def get_memory_usage(): + """ + Get current memory usage in MB. This includes all child processes. 
+ + Returns: + Total memory usage in MB + """ + process = psutil.Process() + + main_memory = process.memory_full_info().pss + + # Add memory usage of all child processes + for child in process.children(recursive=True): + try: + child_mem = child.memory_full_info().pss + main_memory += child_mem + except (psutil.NoSuchProcess, psutil.AccessDenied, AttributeError): + # Process might have terminated or doesn't support PSS, fall back to USS + print(f"Failed to get PSS for {child}, falling back to USS") + child_mem = child.memory_info().uss + main_memory += child_mem + + return main_memory / (1024 * 1024) + + +def print_detailed_memory(): + """Print detailed memory information.""" + process = psutil.Process() + print("\nDetailed memory information:") + try: + print( + f" USS (Unique Set Size): {process.memory_full_info().uss / (1024 * 1024):.2f} MB" + ) + print( + f" PSS (Proportional Set Size): {process.memory_full_info().pss / (1024 * 1024):.2f} MB" + ) + print( + f" RSS (Resident Set Size): {process.memory_info().rss / (1024 * 1024):.2f} MB" + ) + except Exception: + print(" Detailed memory info not available") + + +def create_model(): + """Create a simple model for benchmarking.""" + model = resnet18() + return model + + +def benchmark_dataloader( + dataset, + batch_size, + num_workers, + num_epochs=1, + max_batches=10, + multiprocessing_context=None, + logging_freq=10, +): + """Benchmark a dataloader with specific configuration.""" + print("\n--- Benchmarking DataLoader ---") + + # Clear memory before starting + gc.collect() + torch.cuda.empty_cache() + + # Create model + model = create_model() + + # Measure memory before dataloader creation + memory_before = get_memory_usage() + print(f"Memory before DataLoader creation: {memory_before:.2f} MB") + print_detailed_memory() + + # Measure dataloader initialization time + start = time.perf_counter() + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + pin_memory=torch.cuda.is_available(), + prefetch_factor=2 if num_workers > 0 else None, + multiprocessing_context=multiprocessing_context, + ) + it = iter(dataloader) + dataloader_init_time = time.perf_counter() - start + + # Measure memory after dataloader creation + memory_after = get_memory_usage() + print(f"Memory after DataLoader creation: {memory_after:.2f} MB") + print(f"Memory increase: {memory_after - memory_before:.2f} MB") + + # Create model and optimizer + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = model.to(device) + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + + # Benchmark dataloading speed + model.train() + total_batches = 0 + total_samples = 0 + total_time = 0 + total_data_load_time = 0 + + # Measure peak memory during training + peak_memory = memory_after + + print( + f"\nStarting training loop with {num_epochs} epochs (max {max_batches} batches per epoch)" + ) + + for epoch in range(num_epochs): + while total_batches < max_batches: + batch_start = time.perf_counter() + + try: + inputs, labels = next(it) + except StopIteration: + break + + # Move data to device + inputs = inputs.to(device) + labels = labels.to(device) + + # Capture data fetch time (including sending to device) + data_load_time = time.perf_counter() - batch_start + + # Forward pass + outputs = model(inputs) + loss = criterion(outputs, labels) + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Capture batch time + batch_time 
= time.perf_counter() - batch_start + + total_batches += 1 + total_samples += inputs.size(0) + total_data_load_time += data_load_time + total_time += batch_time + + # Update peak memory and log memory usage periodically + if total_batches % 5 == 0: + # Force garbage collection before measuring memory + gc.collect() + current_memory = get_memory_usage() + + if current_memory > peak_memory: + peak_memory = current_memory + + if total_batches % logging_freq == 0: + print( + f"Epoch {epoch + 1}, Batch {total_batches}, " + f"Time: {batch_time:.4f}s, " + f"Memory: {current_memory:.2f} MB" + ) + + # Calculate statistics + avg_data_load_time = ( + total_data_load_time / total_batches if total_batches > 0 else 0 + ) + avg_batch_time = total_time / total_batches if total_batches > 0 else 0 + samples_per_second = total_samples / total_time if total_time > 0 else 0 + + results = { + "dataloader_init_time": dataloader_init_time, + "num_workers": num_workers, + "batch_size": batch_size, + "total_batches": total_batches, + "avg_batch_time": avg_batch_time, + "avg_data_load_time": avg_data_load_time, + "samples_per_second": samples_per_second, + "peak_memory_mb": peak_memory, + "memory_increase_mb": peak_memory - memory_before, + } + + print("\nResults:") + print(f" DataLoader init time: {dataloader_init_time:.4f} seconds") + print(f" Average data loading time: {avg_data_load_time:.4f} seconds") + print(f" Average batch time: {avg_batch_time:.4f} seconds") + print(f" Samples per second: {samples_per_second:.2f}") + print(f" Peak memory usage: {peak_memory:.2f} MB") + print(f" Memory increase: {peak_memory - memory_before:.2f} MB") + + # Clean up + del model, optimizer + del dataloader + + # Force garbage collection + gc.collect() + torch.cuda.empty_cache() + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark PyTorch DataLoader with different worker methods" + ) + parser.add_argument("--data_path", required=True, help="Path to dataset") + parser.add_argument("--batch_size", type=int, default=32, help="Batch size") + parser.add_argument("--num_workers", type=int, default=4, help="Number of workers") + parser.add_argument( + "--max_batches", + type=int, + default=100, + help="Maximum number of batches per epoch", + ) + parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs") + parser.add_argument( + "--multiprocessing_context", + choices=["fork", "spawn", "forkserver"], + default="forkserver", + help="Multiprocessing context to use (fork, spawn, forkserver)", + ) + parser.add_argument( + "--dataset_copies", + type=int, + default=1, + help="Number of copies of the dataset to concatenate (for testing memory usage)", + ) + parser.add_argument( + "--logging_freq", + type=int, + default=10, + help="Frequency of logging memory usage during training", + ) + args = parser.parse_args() + + # Print system info + print("System Information:") + # The following are handy for debugging if building from source worked correctly + print(f" PyTorch version: {torch.__version__}") + print(f" PyTorch location: {torch.__file__}") + print(f" Torchvision version: {torchvision.__version__}") + print(f" Torchvision location: {torchvision.__file__}") + print(f" CUDA available: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f" CUDA device: {torch.cuda.get_device_name(0)}") + print(f" CPU count: {psutil.cpu_count(logical=True)}") + print(f" Physical CPU cores: {psutil.cpu_count(logical=False)}") + print(f" Total system memory: 
{psutil.virtual_memory().total / (1024**3):.2f} GB") + + # Define transforms + transform = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + + # Load dataset + print(f"\nLoading dataset from {args.data_path} ({args.dataset_copies} copies)") + + # Try to load as ImageFolder + datasets = [] + for _ in range(args.dataset_copies): + base_dataset = torchvision.datasets.ImageFolder( + args.data_path, transform=transform + ) + datasets.append(copy.deepcopy(base_dataset)) + del base_dataset + dataset = ConcatDataset(datasets) + + print(f"Dataset size: {len(dataset)}") + + # Run benchmark with specified worker method + benchmark_dataloader( + dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + multiprocessing_context=args.multiprocessing_context, + num_epochs=args.num_epochs, + max_batches=args.max_batches, + logging_freq=args.logging_freq, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py index 7fa24ae7346b1..678cee5f752c3 100644 --- a/benchmarks/dynamo/check_accuracy.py +++ b/benchmarks/dynamo/check_accuracy.py @@ -14,6 +14,9 @@ "detectron2_maskrcnn_r_101_c4", "timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699 "XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148 + "moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291 + # discovered in https://github.com/pytorch/pytorch/issues/161419. Its not flaky but really hard to repro, so skipping it + "mobilenetv3_large_100", } diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv index af605accecf6e..1d199fe8ea664 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv index 33ede2b914b4f..54b7d63f3a4bc 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv index 1cafcbe55675d..169a42ff7cd41 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv @@ -42,14 +42,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -154,10 +146,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv index 1cafcbe55675d..169a42ff7cd41 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_aot_inductor_freezing_huggingface_inference.csv @@ -42,14 +42,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -154,10 +146,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv index c889ba0e8d2f7..c7d283b9aa52d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_timm_inference.csv @@ 
-46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0 -dla102,pass,0 +dla102,timeout,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv index faafea393ede5..e68aa2fa5351f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -346,7 +346,7 @@ vgg16,pass,0 -vision_maskrcnn,fail_accuracy,30 +vision_maskrcnn,fail_accuracy,29 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv index c889ba0e8d2f7..c7d283b9aa52d 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_timm_inference.csv @@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0 -dla102,pass,0 +dla102,timeout,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv index a2b7c1a7b15ca..aec659fdcd654 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv 
index 697fe04cd91a5..4f2eec1493520 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv index 7f11e13980273..20cad351b1275 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -146,7 +146,7 @@ hf_Bert_large,pass,0 -hf_BigBird,fail_to_run,0 +hf_BigBird,pass,0 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv index cb8cead2ba034..5050b3762ed96 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv @@ -2,7 +2,7 @@ name,accuracy,graph_breaks -torchrec_dlrm,fail_to_run,3 +torchrec_dlrm,pass,6 @@ -94,7 +94,7 @@ hf_Bert_large,pass,6 -hf_BigBird,fail_to_run,3 +hf_BigBird,pass,6 @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,fail_to_run,19 +hf_Reformer,pass,25 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv index c251f34c0e944..b0e8f34b964ec 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_aot_inductor_freezing_torchbench_inference.csv @@ -34,7 +34,7 @@ basic_gnn_gin,pass,0 -basic_gnn_sage,fail_to_run,0 +basic_gnn_sage,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv index 6f9e9e0ed5a7b..c8db4d5823203 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv @@ -122,7 +122,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -142,7 +142,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv index 4f7ca2b638c48..f4c9ffddd9974 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv @@ -138,7 +138,7 @@ hf_Bert_large,pass,0 -hf_BigBird,pass,24 +hf_BigBird,pass,25 @@ -158,7 +158,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 diff --git 
a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv index 7f11e13980273..2b2c1a504647f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -146,7 +146,7 @@ hf_Bert_large,pass,0 -hf_BigBird,fail_to_run,0 +hf_BigBird,fail_accuracy,0 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv index 05eb7e3546eef..89871fd49a04b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv @@ -2,7 +2,7 @@ name,accuracy,graph_breaks -torchrec_dlrm,fail_to_run,3 +torchrec_dlrm,pass,6 @@ -46,7 +46,7 @@ dcgan,pass,6 -demucs,fail_to_run,4 +demucs,pass,9 @@ -94,7 +94,7 @@ hf_Bert_large,pass,6 -hf_BigBird,fail_to_run,3 +hf_BigBird,pass,6 @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,fail_to_run,19 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv +++ 
b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv index af605accecf6e..1d199fe8ea664 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv index 44983e8ecc214..0985e42fc5cb9 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv +++ 
b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv index 9a9a68629f875..e41018657c0e2 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv index 33ede2b914b4f..54b7d63f3a4bc 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv index 9fdb41506e3b2..08061de428d71 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv index b3a3265baa16f..6f316b219bb92 
100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv @@ -166,7 +166,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -181,7 +181,7 @@ hf_T5_base,pass,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv index d2300bdac05b8..48d0b111788f7 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -114,7 +114,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv index 1cafcbe55675d..ce334e22c698b 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_huggingface_inference.csv @@ -42,14 +42,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -66,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -154,10 +146,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv index 9fdb41506e3b2..08061de428d71 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv index 
624f295624783..4b5138ce9c367 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv @@ -166,7 +166,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -181,7 +181,7 @@ hf_T5_base,pass,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv index 1605a26b7ce5f..643a02fdca8fd 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -114,7 +114,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv index 6776cc5f5d7a7..a3fc7cf192371 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -174,7 +174,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv index b43e38b7d822a..ced88884720b7 100644 --- 
a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 -DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv index 9fdb41506e3b2..08061de428d71 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv index b3a3265baa16f..6f316b219bb92 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv @@ -166,7 +166,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -181,7 +181,7 @@ hf_T5_base,pass,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv index 754f5f718e436..d1606b622639e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -114,7 +114,7 @@ hf_Longformer,pass,4 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv index fd57a3b4cbf3c..0f088e7892d8f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_inference.csv @@ -46,14 +46,6 @@ CamemBert,pass,0 
-DebertaForMaskedLM,pass,0 - - - -DebertaForQuestionAnswering,pass,0 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,0 -DistillGPT2,pass,0 +DistillGPT2,pass,2 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,0 -Speech2Text2ForCausalLM,pass,0 - - - T5ForConditionalGeneration,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv index 66e088f334071..f65909f3a24ea 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_huggingface_training.csv @@ -46,14 +46,6 @@ CamemBert,pass,5 -DebertaForMaskedLM,pass,5 - - - -DebertaForQuestionAnswering,pass,5 - - - DebertaV2ForMaskedLM,pass_due_to_skip,0 @@ -70,7 +62,7 @@ DistilBertForQuestionAnswering,pass,5 -DistillGPT2,pass,5 +DistillGPT2,pass,7 @@ -130,7 +122,7 @@ MobileBertForQuestionAnswering,pass,3 -OPTForCausalLM,pass,6 +OPTForCausalLM,pass,8 @@ -158,10 +150,6 @@ RobertaForQuestionAnswering,pass,5 -Speech2Text2ForCausalLM,pass,6 - - - T5ForConditionalGeneration,pass,5 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv index 3e4e9ee702aa3..8ccf95da9659e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv @@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,5 +hf_Reformer,pass,8 @@ -174,7 +174,7 @@ hf_T5_base,eager_fail_to_run,0 -hf_T5_generate,pass,3 +hf_T5_generate,pass,11 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv index 86ad955b5a2cb..e842ac7cb8e1f 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_training.csv @@ -102,7 +102,7 @@ hf_DistilBert,pass,6 -hf_GPT2,pass,6 +hf_GPT2,pass,8 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,pass,23 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 516549d7f6569..2901009f7c4d1 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -21,6 +21,7 @@ import signal import subprocess import sys +import tempfile import time import weakref from contextlib import contextmanager @@ -41,6 +42,7 @@ import torch.distributed import torch.multiprocessing as mp from torch._C import _has_cuda as HAS_CUDA, _has_xpu as HAS_XPU +from torch._C._nativert import PyModelRunner from torch._dynamo.profiler import fx_insert_profiling, Profiler from torch._dynamo.testing import ( dummy_fx_compile, @@ -202,7 +204,6 @@ class CI(NamedTuple): "PLBartForCausalLM", "PLBartForConditionalGeneration", "PegasusForCausalLM", - "Speech2Text2ForCausalLM", "TrOCRForCausalLM", "XGLMForCausalLM", # TIMM @@ -1100,6 +1101,10 @@ def maybe_mark_profile(*args, **kwargs): frozen_model_iter_fn = export_aot_inductor( model, example_inputs, args.inductor_compile_mode ) + elif args.export_nativert: + frozen_model_iter_fn = export_nativert(model, example_inputs) + elif args.torchscript_jit_trace: + frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs) else: frozen_model_iter_fn = 
torch._dynamo.run(model_iter_fn) @@ -1446,6 +1451,60 @@ def get_excess_memory(cls, model) -> float: return cls.cache.get(weakref.ref(model), (None, 0.0))[1] +class NativeRTCache: + cache: dict[weakref.ref, Any] = {} + + @classmethod + def load(cls, model, example_inputs): + from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path + + key = weakref.ref(model) + if key not in cls.cache: + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + example_outputs = model(*example_args, **example_kwargs) + _register_dataclass_output_as_pytree(example_outputs) + + combined_args = _combine_args(model, example_args, example_kwargs) + dynamic_shapes = _tree_map_with_path( + _produce_dynamic_shapes_for_export, combined_args + ) + + ep = torch.export.export( + model, example_args, example_kwargs, dynamic_shapes=dynamic_shapes + ) + ep = ep.run_decompositions({}) + with tempfile.NamedTemporaryFile(delete=False) as f: + torch.export.pt2_archive._package.package_pt2( + f, exported_programs={"forward": ep} + ) + filename = f.name + cls.cache[key] = PyModelRunner(filename, "forward") + + return cls.cache[key] + + +class JitTracedCache: + cache: dict[weakref.ref, Any] = {} + + @classmethod + def load(cls, model, example_inputs): + key = weakref.ref(model) + if key not in cls.cache: + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + if example_args: + jit_traced_module = torch.jit.trace( + model, example_inputs=example_args, strict=False + ) + else: + jit_traced_module = torch.jit.trace( + model, example_kwarg_inputs=example_kwargs, strict=False + ) + + cls.cache[key] = jit_traced_module + + return cls.cache[key] + + def export(model, example_inputs): from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path @@ -1472,6 +1531,16 @@ def opt_export(_, example_inputs): return opt_export +def export_nativert(model, example_inputs): + optimized = NativeRTCache.load(model, example_inputs) + + def opt_nativert(_, example_inputs, collect_outputs=False): + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + return optimized.run(*example_args, **example_kwargs) + + return opt_nativert + + def export_aot_inductor(model, example_inputs, mode): optimized = AOTInductorModelCache.load(model, example_inputs, mode) @@ -1482,6 +1551,16 @@ def opt_aot_inductor(_, example_inputs, collect_outputs=False): return opt_aot_inductor +def torchscript_jit_trace(model, example_inputs): + optimized = JitTracedCache.load(model, example_inputs) + + def opt_jit_trace(_, example_inputs, collect_outputs=False): + example_args, example_kwargs = _normalize_bench_inputs(example_inputs) + return optimized(*example_args, **example_kwargs) + + return opt_jit_trace + + def download_retry_decorator(download_fn): """ Decorator function for applying retry logic to a download function. 
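The NativeRT path wired in above exports the eager module, runs decompositions, packages the exported program into a PT2 archive, and then executes it through torch._C._nativert.PyModelRunner. The condensed Python sketch below walks that same flow outside the benchmark harness; TinyModel and its example inputs are hypothetical stand-ins, and the per-model weakref cache and dynamic-shape derivation that NativeRTCache performs are intentionally left out.

import tempfile

import torch
from torch._C._nativert import PyModelRunner


class TinyModel(torch.nn.Module):
    # Hypothetical module used only to illustrate the export -> package -> run flow.
    def forward(self, x):
        return torch.nn.functional.relu(x) + 1


model = TinyModel().eval()
example_args = (torch.randn(4, 8),)

# Export and decompose, mirroring what NativeRTCache.load does (minus dynamic shapes).
ep = torch.export.export(model, example_args)
ep = ep.run_decompositions({})

# Package the exported program into a PT2 archive on disk.
with tempfile.NamedTemporaryFile(delete=False) as f:
    torch.export.pt2_archive._package.package_pt2(
        f, exported_programs={"forward": ep}
    )
    archive_path = f.name

# Load the archive with NativeRT and run it the way opt_nativert does.
runner = PyModelRunner(archive_path, "forward")
outputs = runner.run(*example_args)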
@@ -2228,7 +2307,12 @@ def record_status(accuracy_status, dynamo_start_stats): try: model_copy = self.deepcopy_and_maybe_parallelize(model) self.init_optimizer(name, current_device, model_copy.parameters()) - if self.args.export or self.args.export_aot_inductor: + if ( + self.args.export + or self.args.export_aot_inductor + or self.args.export_nativert + or self.args.torchscript_jit_trace + ): # apply export on module directly # no need for n iterations # the logic should be the same to self.model_iter_fn (forward_pass) @@ -2624,7 +2708,11 @@ def warmup(fn, model, example_inputs, mode, niters=5): niters=1, ) - if self.args.export_aot_inductor: + if ( + self.args.export_aot_inductor + or self.args.export_nativert + or self.args.torchscript_jit_trace + ): optimized_model_iter_fn = optimize_ctx else: optimized_model_iter_fn = optimize_ctx(self.model_iter_fn) @@ -3377,6 +3465,16 @@ def get_example_inputs(self): action="store_true", help="Measure pass rate with Export+AOTInductor", ) + group.add_argument( + "--export-nativert", + action="store_true", + help="Measure pass rate with Export+NativeRT", + ) + group.add_argument( + "--torchscript-jit-trace", + action="store_true", + help="Measure pass rate with TorchScript jit.trace", + ) group.add_argument( "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch" ) @@ -3818,6 +3916,14 @@ def run(runner, args, original_dir=None): optimize_ctx = export experiment = speedup_experiment output_filename = "export.csv" + elif args.export_nativert: + optimize_ctx = export_nativert + experiment = speedup_experiment + output_filename = "export_nativert.csv" + elif args.torchscript_jit_trace: + optimize_ctx = torchscript_jit_trace + experiment = speedup_experiment + output_filename = "torchscript_jit_trace.csv" elif args.xla: (dev,) = args.devices os.environ["PJRT_DEVICE"] = {"cuda": "GPU", "cpu": "CPU"}[dev] @@ -4132,7 +4238,7 @@ def detect_and_mark_batch(t): nonlocal marked for i, s in enumerate(t.size()): if s == batch_size: - torch._dynamo.mark_dynamic(t, i) + torch._dynamo.maybe_mark_dynamic(t, i) marked = True break diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py index 916a33276d996..76026731fe890 100755 --- a/benchmarks/dynamo/huggingface.py +++ b/benchmarks/dynamo/huggingface.py @@ -370,6 +370,7 @@ def use_larger_multiplier_for_smaller_tensor(self, name): return name in [ "ElectraForQuestionAnswering", "MegatronBertForQuestionAnswering", + "GPT2ForSequenceClassification", ] def _get_model_cls_and_config(self, model_name): @@ -459,6 +460,12 @@ def load_model( else: model.eval() + # Turning off kv cache for torchbench models. This is not the right + # thing to do, but the pt2 dashboard is outdated. Real transformers + # benchmarks will be added soon using a different infra. 
+ if hasattr(model, "config") and hasattr(model.config, "use_cache"): + model.config.use_cache = False + self.validate_model(model, example_inputs) return device, model_name, model, example_inputs, batch_size diff --git a/benchmarks/dynamo/huggingface.yaml b/benchmarks/dynamo/huggingface.yaml index f0ee57a589657..5640776117096 100644 --- a/benchmarks/dynamo/huggingface.yaml +++ b/benchmarks/dynamo/huggingface.yaml @@ -31,8 +31,6 @@ batch_size: BlenderbotSmallForCausalLM: 4 BlenderbotSmallForConditionalGeneration: 2 CamemBert: 2 - DebertaForMaskedLM: 4 - DebertaForQuestionAnswering: 2 DebertaV2ForMaskedLM: 4 DebertaV2ForQuestionAnswering: 8 DistilBertForMaskedLM: 2 @@ -63,7 +61,6 @@ batch_size: PegasusForConditionalGeneration: 2 RobertaForCausalLM: 2 RobertaForQuestionAnswering: 2 - Speech2Text2ForCausalLM: 4 T5ForConditionalGeneration: 2 T5Small: 2 TrOCRForCausalLM: 2 diff --git a/benchmarks/dynamo/huggingface_models_list.txt b/benchmarks/dynamo/huggingface_models_list.txt index 6e3cf19a783d7..12ceedd5c4ccc 100644 --- a/benchmarks/dynamo/huggingface_models_list.txt +++ b/benchmarks/dynamo/huggingface_models_list.txt @@ -10,8 +10,6 @@ BlenderbotForConditionalGeneration,16 BlenderbotSmallForCausalLM,256 BlenderbotSmallForConditionalGeneration,128 CamemBert,32 -DebertaForMaskedLM,32 -DebertaForQuestionAnswering,32 DebertaV2ForMaskedLM,8 DebertaV2ForQuestionAnswering,8 DistilBertForMaskedLM,256 @@ -42,7 +40,6 @@ PegasusForCausalLM,128 PegasusForConditionalGeneration,64 RobertaForCausalLM,32 RobertaForQuestionAnswering,32 -Speech2Text2ForCausalLM,1024 T5ForConditionalGeneration,8 T5Small,8 TrOCRForCausalLM,64 diff --git a/benchmarks/dynamo/huggingface_models_list_cpu.txt b/benchmarks/dynamo/huggingface_models_list_cpu.txt index cabd79ac830f6..4078368a69c44 100644 --- a/benchmarks/dynamo/huggingface_models_list_cpu.txt +++ b/benchmarks/dynamo/huggingface_models_list_cpu.txt @@ -10,8 +10,6 @@ BlenderbotForCausalLM,32 BlenderbotSmallForCausalLM,64 BlenderbotSmallForConditionalGeneration,64 CamemBert,16 -DebertaForMaskedLM,32 -DebertaForQuestionAnswering,8 DebertaV2ForMaskedLM,16 DebertaV2ForQuestionAnswering,2 DistilBertForMaskedLM,128 @@ -38,7 +36,6 @@ PLBartForCausalLM,8 PLBartForConditionalGeneration,4 RobertaForCausalLM,16 RobertaForQuestionAnswering,16 -Speech2Text2ForCausalLM,32 T5ForConditionalGeneration,4 T5Small,1 TrOCRForCausalLM,32 diff --git a/benchmarks/dynamo/pr_time_benchmarks/check_results.py b/benchmarks/dynamo/pr_time_benchmarks/check_results.py index f9204ee98fb05..734d3a01c1e82 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/check_results.py +++ b/benchmarks/dynamo/pr_time_benchmarks/check_results.py @@ -132,10 +132,10 @@ def log(event_name): ) new_entry = copy.deepcopy(entry) - # only change if abs(ratio) > entry.noise_margin /3. + # only change if abs(ratio) > entry.noise_margin /5. 
new_entry.expected_value = ( replace_with_zeros(result) - if abs(ratio) > entry.noise_margin * 100 / 3 + if abs(ratio) > entry.noise_margin * 100 / 5 else entry.expected_value ) new_expected[key] = new_entry diff --git a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv index 5398c40f3573a..fc11be9ba6528 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv +++ b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv @@ -18,7 +18,7 @@ add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.1 -basic_modules_ListOfLinears_eager,compile_time_instruction_count,1009000000,0.1 +basic_modules_ListOfLinears_eager,compile_time_instruction_count,1048000000,0.1 @@ -74,15 +74,15 @@ aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0 -mm_loop_inductor_gpu,compile_time_instruction_count,4461000000,0.1 +mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1 -mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8417000000,0.1 +mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1 -basic_NestedModule_eager,compile_time_instruction_count,8787000000,0.1 +basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1 diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py index c2568aa1daa19..1f10ecc661d8e 100755 --- a/benchmarks/dynamo/torchbench.py +++ b/benchmarks/dynamo/torchbench.py @@ -382,6 +382,22 @@ def load_model( if self.args.trace_on_xla: # work around for: https://github.com/pytorch/xla/issues/4174 import torch_xla # noqa: F401 + + # Turning off kv cache for torchbench models. This is not the right + # thing to do, but the torchbench models are way outdated, and since we + # are using torchbench pt2 dashboard to track regressions (rather than + # improving performance), we are just setting the kv cache to false. + # Real transformers benchmarks will be added soon using a different + # infra. + if ( + model_name.startswith("hf") + and hasattr(model, "config") + and hasattr(model.config, "use_cache") + ): + model.config.use_cache = False + if model_name == "hf_T5_generate": + model.model.config.use_cache = False + self.validate_model(model, example_inputs) return device, benchmark.name, model, example_inputs, batch_size diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml index bf0a1b6c31e85..6a15cf33222b2 100644 --- a/benchmarks/dynamo/torchbench.yaml +++ b/benchmarks/dynamo/torchbench.yaml @@ -219,7 +219,9 @@ skip: - timm_regnet - timm_nfnet - cuda: [] + cuda: + # Temporary until https://github.com/pytorch/pytorch/issues/162282 is fixed + - sam_fast test: training: diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py index cb836bb5eaa4b..0b7fcf4e555f8 100644 --- a/benchmarks/operator_benchmark/benchmark_core.py +++ b/benchmarks/operator_benchmark/benchmark_core.py @@ -4,6 +4,7 @@ import functools import json import os +import platform import timeit from collections import namedtuple from dataclasses import asdict, dataclass @@ -191,6 +192,11 @@ def __init__(self, args): self.predefined_minimum_secs = 1 self.max_iters = 1e6 self.use_jit = args.use_jit + self.use_compile = args.use_compile + if self.use_jit and self.use_compile: + raise ValueError( + "use_jit and use_compile are mutually exclusive, please specify one." 
+ ) self.num_runs = args.num_runs self.print_per_iter = False self.output_csv = args.output_csv @@ -222,7 +228,7 @@ def _print_header(self): if self.args.operators: print(f"# {self.args.operators}") - def _print_perf_result(self, reported_run_time_us, test_case): + def _print_perf_result(self, results, test_case): if self.args.report_aibench: # Output for AIBench # Print out per iteration execution time instead of avg time @@ -236,12 +242,14 @@ def _print_perf_result(self, reported_run_time_us, test_case): "type": test_name, "metric": "latency", "unit": "us", - "value": str(reported_run_time_us[run]), + "value": str(results["reported_run_time_us"[run]]), } ) ) else: - print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}") + print( + f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}" + ) print( f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}" ) @@ -250,25 +258,33 @@ def _print_perf_result(self, reported_run_time_us, test_case): if self.num_runs > 1: for run in range(self.num_runs): print( - f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}" + f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}" ) print() else: - print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n") + print( + f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}" + ) + print(f"Peak Memory (KB) : {results['peak_memory']}\n") - def _perf_result_to_dict(self, reported_run_time_us, test_case): + def _perf_result_to_dict(self, results, test_case): """This function is the parallel of _print_perf_result, which instead of writing information to terminal, returns a dictionary. """ if self.args.report_aibench: return {} + out = { "test_name": test_case.test_config.test_name, "input_config": test_case.test_config.input_config, - "mode": "JIT" if self.use_jit else "Eager", + "runtime": ( + "JIT" if self.use_jit else "Compile" if self.use_compile else "Eager" + ), "run": "Backward" if test_case.test_config.run_backward else "Forward", - "latency": round(reported_run_time_us[0], 3), + "latency": round(results["reported_run_time_us"][0], 3), "latency unit": "us", + "peak memory": results["peak_memory"], + "memory unit": "KB", } # parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary @@ -330,6 +346,8 @@ def _launch_forward(self, test_case, iters, print_per_iter): func = test_case.run_forward if self.use_jit: func = test_case.run_jit_forward + if self.use_compile: + func = test_case.run_compile_forward forward_time = timeit.timeit( functools.partial(func, iters, print_per_iter, cuda_sync), number=1 ) @@ -346,7 +364,7 @@ def _launch_backward(self, test_case, iters, print_per_iter=False): ) return backward_time - def _measure_time(self, launch_test, test_case, iters, print_per_iter): + def _measure_metrics(self, launch_test, test_case, iters, print_per_iter): """ This function execute the operator for iterations then look at the time. If it's not significant, the number of iterations will be increased before rerun. 
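# A minimal sketch of how a "Peak Memory (KB)" figure like the one reported above can
# be obtained, assuming a CUDA device; the matmul workload is illustrative, and the
# counters used are the standard torch.cuda memory statistics.
import torch


def run_once() -> None:
    a = torch.randn(2048, 2048, device="cuda")
    b = torch.randn(2048, 2048, device="cuda")
    torch.mm(a, b)


torch.cuda.reset_peak_memory_stats()
run_once()
torch.cuda.synchronize()  # ensure the kernels finished before reading the counter
peak_kb = torch.cuda.max_memory_allocated() / 1024
print(f"Peak Memory (KB) : {peak_kb}")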
@@ -354,8 +372,20 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): """ curr_test_total_time = 0 time_trace = [] + peak_memory = 0 + sample_input = next(iter(test_case.op_bench.inputs.values())) + device = sample_input.device + device_module = torch.get_device_module(device.type) + # TODO: add support for cpu memory measurement while True: + if hasattr(device_module, "reset_peak_memory_stats"): + device_module.reset_peak_memory_stats(device) run_time_sec = launch_test(test_case, iters, print_per_iter) + if hasattr(device_module, "synchronize"): + device_module.synchronize(device) + # Memory measurement process + if hasattr(device_module, "max_memory_allocated"): + peak_memory = device_module.max_memory_allocated(device) curr_test_total_time += run_time_sec # Analyze time after each run to decide if the result is stable results_are_significant = self._iteration_result_is_significant( @@ -369,7 +399,13 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): time_trace.append(report_run_time) # Print out the time spent in each epoch in ms if self.args.report_aibench: - mode = "JIT" if self.use_jit else "Eager" + mode = ( + "JIT" + if self.use_jit + else "Compile" + if self.use_compile + else "Eager" + ) test_name = "_".join( [test_case.framework, test_case.test_config.test_name, mode] ) @@ -381,7 +417,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): "metric": "latency", "unit": "ms", "value": str(report_run_time / 1e3), - } + }, ) ) if results_are_significant: @@ -391,7 +427,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): # iteration count, and run the benchmark again... iters = self._predict_num_iter_needed(iters) reported_run_time_us = np.percentile(np.array(time_trace), 50) - return reported_run_time_us + return reported_run_time_us, peak_memory / 1024 def _check_keep(self, test_flag, cmd_flag): return cmd_flag is None or test_flag == cmd_flag @@ -478,6 +514,7 @@ def _output_json( self, perf_list, output_file, + benchmark_name="PyTorch operator benchmark", ): """ Write the result into JSON format, so that it can be uploaded to the benchmark database @@ -495,8 +532,10 @@ def _output_json( input_config = perf_item.get("input_config", "") run_type = perf_item.get("run") latency = perf_item.get("latency", 0) - - dtype = "float32" # default + peak_memory = perf_item.get("peak memory", 0) + device = perf_item.get("device", "unknown") + dtype = perf_item.get("dtype", "torch.float").split(".")[1] + runtime = perf_item.get("runtime", None) # Extract mode based on run_type mode = None @@ -505,6 +544,22 @@ def _output_json( elif run_type == "Backward": mode = "training" + # Extract use_compile from it + if runtime == "Compile": + use_compile = True + elif runtime == "Eager": + use_compile = False + else: + use_compile = None + + device_arch = ( + torch.cuda.get_device_name(0) + if device == "cuda" + else platform.processor() + if device == "cpu" + else "unknown" + ) + # Create the record @dataclass class BenchmarkInfo: @@ -532,12 +587,18 @@ class BenchmarkRecord: model: ModelInfo metric: MetricInfo - record = BenchmarkRecord( + # Add record for latency + record_latency = BenchmarkRecord( benchmark=BenchmarkInfo( - name="PyTorch operator benchmark", + name=benchmark_name, mode=mode, dtype=dtype, - extra_info={"input_config": input_config}, + extra_info={ + "input_config": input_config, + "device": device, + "arch": device_arch, + "use_compile": use_compile, + }, ), model=ModelInfo( name=test_name, 
type="micro-benchmark", origins=["pytorch"] @@ -549,8 +610,17 @@ class BenchmarkRecord: target_value=None, ), ) - - records.append(asdict(record)) + records.append(asdict(record_latency)) + + # Add record for peak memory + record_memory = copy.deepcopy(record_latency) + record_memory.metric = MetricInfo( + name="peak memory", + unit="KB", + benchmark_values=[peak_memory], + target_value=None, + ) + records.append(asdict(record_memory)) # Write all records to the output file with open(output_file, "w", encoding="utf-8") as f: @@ -566,6 +636,7 @@ def run(self): "tag", "run_backward", "Execution Time", + "Peak Memory (KB)", ] if self.args.output_json or self.args.output_json_for_dashboard: @@ -603,13 +674,16 @@ def run(self): test_case, self.args.warmup_iterations, print_per_iter=False ) # Actual Execution - reported_time = [ - self._measure_time( + results = [ + self._measure_metrics( launch_func, test_case, self.iters, self.print_per_iter ) for _ in range(self.num_runs) ] - self._print_perf_result(reported_time, test_case) + result_dict = dict() + result_dict["reported_run_time_us"] = [r[0] for r in results] + result_dict["peak_memory"] = results[0][1] + self._print_perf_result(results=result_dict, test_case=test_case) # output results to csv self._output_csv( @@ -625,16 +699,17 @@ def run(self): ), test_case.test_config.tag, test_case.test_config.run_backward, - reported_time[0], + result_dict["reported_run_time_us"][0], + result_dict["peak_memory"], ], ) if self.args.output_json or self.args.output_json_for_dashboard: - perf_list.append( - self._perf_result_to_dict(reported_time, test_case) - ) + perf_list.append(self._perf_result_to_dict(result_dict, test_case)) if self.args.output_json_for_dashboard: - self._output_json(perf_list, self.args.output_json_for_dashboard) + self._output_json( + perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name + ) if self.args.output_json: with open(self.args.output_json, "w") as f: diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py index 52ae47047daab..a7ff40ebb340e 100644 --- a/benchmarks/operator_benchmark/benchmark_pytorch.py +++ b/benchmarks/operator_benchmark/benchmark_pytorch.py @@ -4,6 +4,15 @@ import torch +# Import the C++ extension to register the _consume operator +try: + import benchmark_cpp_extension # noqa: F401 +except ImportError as err: + # If the extension isn't built, the script must raise an error + raise ImportError( + "Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ." + ) from err + """PyTorch performance microbenchmarks. 
This module contains PyTorch-specific functionalities for performance @@ -71,6 +80,16 @@ def forward_consume(self, iters: int): for _ in range(iters): torch.ops.operator_benchmark._consume(self.forward_impl()) + def forward_impl_eager(self): + # This is to supply the inputs to the forward function which + # will be called in both the eager and compile mode of local runs + return self.forward(*self.get_inputs()) + + def forward_consume_eager(self, iters: int): + # Eager version of forward_consume without decorators (compilation handled by torch.compile) + for _ in range(iters): + torch.ops.operator_benchmark._consume(self.forward_impl_eager()) + def module_name(self): """this is used to label the operator being benchmarked""" if self.user_given_name: @@ -117,18 +136,32 @@ def __init__(self, op_bench, test_config): self.framework = "PyTorch" self.time_series = [] self._jit_forward_graph = None + self._compile_forward_graph = None def _generate_jit_forward_graph(self): """generate a graph for the forward function via scripting""" scripted_op_bench = torch.jit.script(self.op_bench) return scripted_op_bench.forward_consume + def _generate_compile_forward_graph(self): + """generate a compiled graph for the forward function via torch.compile""" + compiled_forward_consume = torch.compile( + self.op_bench.forward_consume_eager, backend="inductor" + ) + return compiled_forward_consume + def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False): """Run the forward path of an op with JIT mode""" if self._jit_forward_graph is None: self._jit_forward_graph = self._generate_jit_forward_graph() self._jit_forward_graph(num_runs) + def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False): + """Run the forward path of an op with compile mode""" + if self._compile_forward_graph is None: + self._compile_forward_graph = self._generate_compile_forward_graph() + self._compile_forward_graph(num_runs) + def _print_per_iter(self): # print last 50 values length = min(len(self.time_series), 50) @@ -150,14 +183,14 @@ def run_forward(self, num_runs, print_per_iter, cuda_sync): if print_per_iter: for _ in range(num_runs): start_time = time.time() - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) end_time = time.time() self.time_series.append((end_time - start_time) * 1e3) else: for _ in range(num_runs): - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) diff --git a/benchmarks/operator_benchmark/benchmark_runner.py b/benchmarks/operator_benchmark/benchmark_runner.py index 9dfab781498ea..6568cf9bf3ee6 100644 --- a/benchmarks/operator_benchmark/benchmark_runner.py +++ b/benchmarks/operator_benchmark/benchmark_runner.py @@ -62,6 +62,13 @@ def parse_args(): default=None, ) + parser.add_argument( + "--benchmark-name", + "--benchmark_name", + help="Name of the benchmark to store results to", + default="PyTorch operator benchmark", + ) + parser.add_argument( "--list-tests", "--list_tests", @@ -135,6 +142,16 @@ def parse_args(): help="Run operators with PyTorch JIT mode", ) + parser.add_argument( + "--use-compile", + "--use_compile", + type=benchmark_utils.str2bool, + nargs="?", + const=True, + default=False, + help="Run operators with PyTorch Compile mode", + ) + parser.add_argument( "--forward-only", "--forward_only", @@ -162,7 +179,7 @@ def parse_args(): 
"--output-json-for-dashboard", "--output_json_for_dashboard", help="Save results in JSON format for display on the OSS dashboard", - default="False", + default="benchmark-results.json", ) args, _ = parser.parse_known_args() diff --git a/buckbuild.bzl b/buckbuild.bzl index 09a515584d97c..218fd747301f9 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -11,7 +11,7 @@ load("//tools/build_defs:glob_defs.bzl", "subdir_glob") load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX") load("//tools/build_defs:type_defs.bzl", "is_list", "is_string") load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build") -load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build") +load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build") load( ":build_variables.bzl", "aten_cpu_source_list", @@ -74,7 +74,7 @@ def _is_build_mode_dev(): if is_production_build_android(): # Android Prod builds return False - if is_production_build_ios(): + if is_production_build_ios() or is_profile_build_ios(): # iOS Prod builds return False @@ -824,9 +824,13 @@ def get_pt_operator_registry_dict( apple_sdks = kwargs.get("apple_sdks"), ) + # Extract existing linker_flags from kwargs and combine with default flags + existing_linker_flags = kwargs.pop("linker_flags", []) + combined_linker_flags = get_no_as_needed_linker_flag() + existing_linker_flags + return dict( srcs = code_gen_files["srcs"], - linker_flags = get_no_as_needed_linker_flag(), + linker_flags = combined_linker_flags, # @lint-ignore BUCKLINT link_whole link_whole = True, soname = "libtorch-code-gen.$(ext)", @@ -944,6 +948,7 @@ def define_buck_targets( [ ("torch/csrc/api/include", "torch/**/*.h"), ("", "torch/csrc/**/*.h"), + ("", "torch/csrc/**/*.hpp"), ("", "torch/nativert/**/*.h"), ("", "torch/headeronly/**/*.h"), ("", "torch/script.h"), @@ -1144,6 +1149,9 @@ def define_buck_targets( "--replace", "@AT_KLEIDIAI_ENABLED@", "0", + "--replace", + "@AT_USE_EIGEN_SPARSE@", + "0", ]), outs = { "Config.h": ["Config.h"], @@ -2026,6 +2034,7 @@ def define_buck_targets( ("", "caffe2/utils/*.h"), ("", "caffe2/core/*.h"), ("", "torch/csrc/*.h"), + ("", "torch/csrc/*.hpp"), ("", "torch/csrc/api/include/torch/*.h"), ("", "torch/csrc/autograd/*.h"), ("", "torch/csrc/autograd/*/*.h"), diff --git a/build_variables.bzl b/build_variables.bzl index a226249db7089..05b1cfdc7a4b0 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -512,6 +512,7 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/TCPStoreBackend.cpp", "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp", + "torch/csrc/distributed/c10d/Types.cpp", "torch/csrc/distributed/c10d/Utils.cpp", "torch/csrc/distributed/c10d/Work.cpp", "torch/csrc/distributed/c10d/comm.cpp", @@ -631,6 +632,16 @@ libtorch_nativert_sources = [ "torch/nativert/kernels/NativeKernels.cpp", "torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp", "torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp", + "torch/nativert/graph/passes/SubgraphRewriter.cpp", + "torch/nativert/graph/passes/pass_manager/GraphPasses.cpp", + "torch/nativert/graph/passes/pass_manager/PassManager.cpp", + "torch/nativert/kernels/KernelHandlerRegistry.cpp", + "torch/nativert/kernels/TritonKernel.cpp", + "torch/nativert/executor/triton/CpuTritonKernelManager.cpp", +] + +libtorch_nativert_cuda_sources = [ + 
"torch/nativert/executor/triton/CudaTritonKernelManager.cpp", ] torch_mobile_tracer_sources = [ @@ -751,14 +762,22 @@ libtorch_cuda_distributed_extra_sources = [ "torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", ] +libtorch_nvshmem_sources = [ + "torch/csrc/distributed/c10d/cuda/utils.cpp", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu", + "torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu", +] + libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [ "torch/csrc/cuda/nccl.cpp", -] +] + libtorch_nativert_cuda_sources torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA @@ -1075,6 +1094,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/DeviceAccelerator.cpp", "aten/src/ATen/Context.cpp", "aten/src/ATen/DLConvertor.cpp", + "aten/src/ATen/DTensorState.cpp", "aten/src/ATen/EmptyTensor.cpp", "aten/src/ATen/ExpandUtils.cpp", "aten/src/ATen/CachedTensorUtils.cpp", diff --git a/c10/core/AllocatorConfig.cpp b/c10/core/AllocatorConfig.cpp index e154338d501b2..c6b6e95f43b28 100644 --- a/c10/core/AllocatorConfig.cpp +++ b/c10/core/AllocatorConfig.cpp @@ -45,7 +45,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) { 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart); const size_t interval_end = 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd); - TORCH_CHECK_VALUE( + TORCH_CHECK( interval_end - interval_start == kRoundUpPowerOfTwoIntervals, "kRoundUpPowerOfTwoIntervals mismatch"); @@ -64,7 +64,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_split_size_mb too small, must be >= ", min_allowed_split_size_mb); @@ -83,7 +83,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_non_split_rounding_mb too small, must be >= ", min_allowed_split_size_mb); @@ -98,7 +98,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold( size_t i) { tokenizer.checkToken(++i, ":"); double val_env = tokenizer.toDouble(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env > 0 && val_env < 1.0, "garbage_collect_threshold is invalid, set it in (0.0, 1.0)"); garbage_collection_threshold_ = val_env; @@ -119,7 +119,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( size_t value_index = i; tokenizer.checkToken(++i, ":"); size_t value = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( value == 0 || llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 or 0 to disable roundup "); @@ -133,7 +133,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( value); } else { size_t boundary = tokenizer.toSizeT(value_index); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(boundary), "For roundups, the intervals have to be power 
of 2 "); @@ -163,7 +163,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( "Expected closing bracket ']' in ConfigTokenizer but reached end of config"); } else { // Keep this for backwards compatibility size_t value = tokenizer.toSizeT(i); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 "); std::fill( diff --git a/c10/core/AllocatorConfig.h b/c10/core/AllocatorConfig.h index efde5e3a8ff98..68cc47a8417c2 100644 --- a/c10/core/AllocatorConfig.h +++ b/c10/core/AllocatorConfig.h @@ -76,7 +76,7 @@ class ConfigTokenizer { } else if (token == "False") { return false; } else { - TORCH_CHECK_VALUE( + TORCH_CHECK( false, "Expected 'True' or 'False' at index ", i, diff --git a/c10/core/Backend.h b/c10/core/Backend.h index 67c9276313bba..0497d72b95703 100644 --- a/c10/core/Backend.h +++ b/c10/core/Backend.h @@ -237,8 +237,6 @@ inline DeviceType backendToDeviceType(Backend b) { return DeviceType::CPU; case Backend::CUDA: case Backend::SparseCUDA: - case Backend::SparseMPS: - case Backend::SparseCsrMPS: case Backend::QuantizedCUDA: case Backend::SparseCsrCUDA: return DeviceType::CUDA; @@ -276,6 +274,8 @@ inline DeviceType backendToDeviceType(Backend b) { case Backend::Meta: return DeviceType::Meta; case Backend::MPS: + case Backend::SparseMPS: + case Backend::SparseCsrMPS: return DeviceType::MPS; case Backend::HPU: return DeviceType::HPU; diff --git a/c10/core/CachingDeviceAllocator.cpp b/c10/core/CachingDeviceAllocator.cpp new file mode 100644 index 0000000000000..582efd59cf1b1 --- /dev/null +++ b/c10/core/CachingDeviceAllocator.cpp @@ -0,0 +1,10 @@ +#include + +namespace c10 { + +// Ensures proper DLL export of this pure virtual base class on Windows, +// since it's mainly used in other DLLs outside c10.dll. +DeviceAllocator::DeviceAllocator() = default; +DeviceAllocator::~DeviceAllocator() = default; + +} // namespace c10 diff --git a/c10/core/CachingDeviceAllocator.h b/c10/core/CachingDeviceAllocator.h index b23490de693a8..0bec03ae417fa 100644 --- a/c10/core/CachingDeviceAllocator.h +++ b/c10/core/CachingDeviceAllocator.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace c10::CachingDeviceAllocator { @@ -59,3 +60,55 @@ struct DeviceStats { }; } // namespace c10::CachingDeviceAllocator + +namespace c10 { + +using CaptureId_t = unsigned long long; + +// first is set if the instance is created by Graph mode capture_begin. +// second is set if the instance is created by Graph mode graph_pool_handle. +using MempoolId_t = std::pair; + +struct C10_API DeviceAllocator : public c10::Allocator { + DeviceAllocator(); + ~DeviceAllocator() override; + + // Returns true if the allocator has been properly initialized and is ready + // for use + virtual bool initialized() = 0; + + // Releases all cached device memory from the specified memory pool back to + // the system + virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0; + + // Associates a memory allocation with a stream to establish dependency + // tracking. 
Prevents memory reuse until all operations on the specified + // stream complete + virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0; + + // Retrieves comprehensive memory statistics for the specified device, + // including allocation patterns, usage metrics + virtual CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) = 0; + + // Resets cumulative allocation statistics for the specified device to zero + virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; + + // Resets peak memory usage statistics for the specified device + virtual void resetPeakStats(c10::DeviceIndex device) = 0; +}; + +// This function is used to get the DeviceAllocator for a specific device type +// and keep backward compatibility with c10::GetAllocator. +C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) { + TORCH_CHECK( + t != DeviceType::CPU, + "getDeviceAllocator is not supported for CPU device type."); + auto* allocator = c10::GetAllocator(t); + auto* device_allocator = dynamic_cast(allocator); + TORCH_INTERNAL_ASSERT( + device_allocator, "Allocator for ", t, " is not a DeviceAllocator."); + return device_allocator; +} + +} // namespace c10 diff --git a/c10/core/Contiguity.h b/c10/core/Contiguity.h index 279a795583b12..eed3f24983424 100644 --- a/c10/core/Contiguity.h +++ b/c10/core/Contiguity.h @@ -33,7 +33,8 @@ bool _compute_contiguous(ArrayRef sizes, ArrayRef strides, T numel) { } // Return a SymBool with underlying symbolic expression that represents -// contiguity. Guaranteed not to add guards. +// contiguity. Guaranteed not to throw DDE, may returns a symbolic expressions +// or symbolic True. inline static c10::SymBool _compute_contiguous_sym( ArrayRef sizes, ArrayRef strides, @@ -76,6 +77,8 @@ inline static c10::SymBool _compute_contiguous_sym( return true; }; + // We try to minimize creating large symbolic expressions when not needed to + // avoid symbolic evaluation perf issues. if (is_contiguous_or_false()) { return c10::SymBool(true); } @@ -94,6 +97,9 @@ inline static c10::SymBool _compute_contiguous_sym( return is_contiguous_cond.sym_or(is_empty); } +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_2d_sym does not. Only use this function +// when inputs are hinted. template bool _compute_channels_last_contiguous_2d( ArrayRef sizes, @@ -105,8 +111,8 @@ bool _compute_channels_last_contiguous_2d( T expected = 1; for (auto& d : {1, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -123,6 +129,65 @@ bool _compute_channels_last_contiguous_2d( } } +// Return a SymBool with underlying symbolic expression that represents +// contiguity. Guaranteed not to throw DDE, may returns a symbolic expressions +// or symbolic True. +inline static c10::SymBool _compute_channels_last_contiguous_2d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 4: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. 
+ if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. + c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_3d_sym does not. Only use this function +// when inputs are hinted. template bool _compute_channels_last_contiguous_3d( ArrayRef sizes, @@ -134,8 +199,8 @@ bool _compute_channels_last_contiguous_3d( T expected = 1; for (auto& d : {1, 4, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -152,6 +217,59 @@ bool _compute_channels_last_contiguous_3d( } } +inline static c10::SymBool _compute_channels_last_contiguous_3d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 5: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. 
+ c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + template bool _compute_non_overlapping_and_dense( ArrayRef sizes, diff --git a/c10/core/Layout.h b/c10/core/Layout.h index 0daa129bb5a4f..0d09e0ed46f4e 100644 --- a/c10/core/Layout.h +++ b/c10/core/Layout.h @@ -33,7 +33,6 @@ inline Layout layout_from_backend(Backend backend) { case Backend::SparseCPU: case Backend::SparseCUDA: case Backend::SparseMPS: - case Backend::SparseCsrMPS: case Backend::SparseHIP: case Backend::SparseVE: case Backend::SparseXPU: @@ -43,6 +42,7 @@ inline Layout layout_from_backend(Backend backend) { return Layout::Mkldnn; case Backend::SparseCsrCPU: case Backend::SparseCsrCUDA: + case Backend::SparseCsrMPS: case Backend::SparseCsrHIP: case Backend::SparseCsrVE: case Backend::SparseCsrXPU: diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h index 3b483c86bc88f..646a1dde39940 100644 --- a/c10/core/Scalar.h +++ b/c10/core/Scalar.h @@ -191,11 +191,17 @@ class C10_API Scalar { isIntegral() const { return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag; } + bool isIntegral(bool includeBool) const { return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag || (includeBool && isBoolean()); } + // See Note [Meaning of HAS_u] + bool isUnsigned() const { + return Tag::HAS_u == tag || (Tag::HAS_i == tag && v.i >= 0); + } + bool isComplex() const { return Tag::HAS_z == tag; } diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 3d8a2b0074e9e..4a15eb23ac63c 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -19,25 +19,16 @@ #include #include -#include #include #include #include #include -namespace c10 { - -// dummy struct for uint1 to uint7, actual functionality -// of these dtypes will be implemented in python with Tensor subclass -template -struct dummy_uint1_7_t {}; +#include -// dummy struct for int1 to int7, actual functionality -// of these dtypes will be implemented in python with Tensor subclass -template -struct dummy_int1_7_t {}; +namespace c10 { -// For the macros below: +// [dtype Macros note] For the macros below: // // For users: If you want to macro some code for all non-QInt scalar types // (i.e. types with complete information, you probably want one of the @@ -57,56 +48,6 @@ struct dummy_int1_7_t {}; // some old PRs where we added new dtypes (check history of this file) can // help give you an idea where to start. -// NB: Order matters for this macro; it is relied upon in -// _promoteTypesLookup and the serialization format. 
-#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(_) \ - _(uint8_t, Byte) /* 0 */ \ - _(int8_t, Char) /* 1 */ \ - _(int16_t, Short) /* 2 */ \ - _(int, Int) /* 3 */ \ - _(int64_t, Long) /* 4 */ \ - _(at::Half, Half) /* 5 */ \ - _(float, Float) /* 6 */ \ - _(double, Double) /* 7 */ \ - _(c10::complex, ComplexHalf) /* 8 */ \ - _(c10::complex, ComplexFloat) /* 9 */ \ - _(c10::complex, ComplexDouble) /* 10 */ \ - _(bool, Bool) /* 11 */ \ - _(c10::qint8, QInt8) /* 12 */ \ - _(c10::quint8, QUInt8) /* 13 */ \ - _(c10::qint32, QInt32) /* 14 */ \ - _(at::BFloat16, BFloat16) /* 15 */ \ - _(c10::quint4x2, QUInt4x2) /* 16 */ \ - _(c10::quint2x4, QUInt2x4) /* 17 */ \ - _(c10::bits1x8, Bits1x8) /* 18 */ \ - _(c10::bits2x4, Bits2x4) /* 19 */ \ - _(c10::bits4x2, Bits4x2) /* 20 */ \ - _(c10::bits8, Bits8) /* 21 */ \ - _(c10::bits16, Bits16) /* 22 */ \ - _(c10::Float8_e5m2, Float8_e5m2) /* 23 */ \ - _(c10::Float8_e4m3fn, Float8_e4m3fn) /* 24 */ \ - _(c10::Float8_e5m2fnuz, Float8_e5m2fnuz) /* 25 */ \ - _(c10::Float8_e4m3fnuz, Float8_e4m3fnuz) /* 26 */ \ - _(uint16_t, UInt16) /* 27 */ \ - _(uint32_t, UInt32) /* 28 */ \ - _(uint64_t, UInt64) /* 29 */ \ - _(c10::dummy_uint1_7_t<1>, UInt1) /* 30 */ \ - _(c10::dummy_uint1_7_t<2>, UInt2) /* 31 */ \ - _(c10::dummy_uint1_7_t<3>, UInt3) /* 32 */ \ - _(c10::dummy_uint1_7_t<4>, UInt4) /* 33 */ \ - _(c10::dummy_uint1_7_t<5>, UInt5) /* 34 */ \ - _(c10::dummy_uint1_7_t<6>, UInt6) /* 35 */ \ - _(c10::dummy_uint1_7_t<7>, UInt7) /* 36 */ \ - _(c10::dummy_int1_7_t<1>, Int1) /* 37 */ \ - _(c10::dummy_int1_7_t<2>, Int2) /* 38 */ \ - _(c10::dummy_int1_7_t<3>, Int3) /* 39 */ \ - _(c10::dummy_int1_7_t<4>, Int4) /* 40 */ \ - _(c10::dummy_int1_7_t<5>, Int5) /* 41 */ \ - _(c10::dummy_int1_7_t<6>, Int6) /* 42 */ \ - _(c10::dummy_int1_7_t<7>, Int7) /* 43 */ \ - _(c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */ \ - _(c10::Float4_e2m1fn_x2, Float4_e2m1fn_x2) /* 45 */ - // If you want to support ComplexHalf for real, add ComplexHalf // into this macro (and change the name). But beware: convert() // doesn't work for all the conversions you need... @@ -152,17 +93,6 @@ struct dummy_int1_7_t {}; _(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \ _(at::Float8_e8m0fnu, Float8_e8m0fnu) -enum class ScalarType : int8_t { -#define DEFINE_ST_ENUM_VAL_(_1, n) n, - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ST_ENUM_VAL_) -#undef DEFINE_ENUM_ST_ENUM_VAL_ - Undefined, - NumOptions -}; - -constexpr uint16_t NumScalarTypes = - static_cast(ScalarType::NumOptions); - namespace impl { // These are used to map ScalarTypes to C++ types. 
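# The channels-last checks in c10/core/Contiguity.h above walk the dims in the order
# {1, 3, 2, 0}, skip size-1 dims, and compare each stride against the running product
# of the sizes seen so far. A plain-Python rendering of that walk for concrete (hinted)
# sizes and strides; the NCHW tensor below is an illustrative assumption.
import torch


def is_channels_last_contiguous_2d(sizes, strides) -> bool:
    if len(sizes) != 4:
        return False
    expected = 1
    for d in (1, 3, 2, 0):
        if sizes[d] == 1:
            continue  # size-1 dims impose no stride constraint
        if strides[d] != expected:
            return False
        expected *= sizes[d]
    return True


t = torch.empty(2, 3, 4, 5).to(memory_format=torch.channels_last)
assert is_channels_last_contiguous_2d(t.size(), t.stride()) == t.is_contiguous(
    memory_format=torch.channels_last
)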
diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp index c6c2743d8358a..b78ca94dc5145 100644 --- a/c10/core/SymInt.cpp +++ b/c10/core/SymInt.cpp @@ -20,6 +20,14 @@ void SymInt::promote_to_negative() { s.data_ = 0; } +std::optional SymInt::maybe_as_int_slow_path() const { + auto* node = toSymNodeImplUnowned(); + if (auto c = node->constant_int()) { + return c; + } + return node->maybe_as_int(); +} + SymNode SymInt::toSymNode() const { TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE( is_heap_allocated(), "SymInt::toSymNode is_heap_allocated"); @@ -45,12 +53,11 @@ bool SymInt::has_hint() const { #define DEFINE_BINARY(API, OP, METHOD, RET) \ RET SymInt::API(const SymInt& sci) const { \ if (auto ma = maybe_as_int()) { \ - if (auto mb = sci.maybe_as_int()) { \ - return RET(OP(*ma, *mb)); \ - } else { \ - auto b = sci.toSymNode(); \ - return RET(b->wrap_int(*ma)->METHOD(b)); \ - } \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( \ + !sci.maybe_as_int(), \ + "should have hit fast path in the header in this case."); \ + auto b = sci.toSymNode(); \ + return RET(b->wrap_int(*ma)->METHOD(b)); \ } else { \ if (auto mb = sci.maybe_as_int()) { \ auto a = toSymNodeImplUnowned(); \ @@ -61,19 +68,19 @@ bool SymInt::has_hint() const { } \ } -DEFINE_BINARY(operator+, std::plus<>(), add, SymInt) -DEFINE_BINARY(operator-, std::minus<>(), sub, SymInt) -DEFINE_BINARY(operator*, std::multiplies<>(), mul, SymInt) -DEFINE_BINARY(operator/, std::divides<>(), floordiv, SymInt) -DEFINE_BINARY(operator%, std::modulus<>(), mod, SymInt) -DEFINE_BINARY(sym_eq, std::equal_to<>(), eq, SymBool) -DEFINE_BINARY(sym_ne, std::not_equal_to<>(), ne, SymBool) -DEFINE_BINARY(sym_lt, std::less<>(), lt, SymBool) -DEFINE_BINARY(sym_le, std::less_equal<>(), le, SymBool) -DEFINE_BINARY(sym_gt, std::greater<>(), gt, SymBool) -DEFINE_BINARY(sym_ge, std::greater_equal<>(), ge, SymBool) -DEFINE_BINARY(min, std::min, sym_min, SymInt) -DEFINE_BINARY(max, std::max, sym_max, SymInt) +DEFINE_BINARY(operator_add_slow_path, std::plus<>(), add, SymInt) +DEFINE_BINARY(operator_sub_slow_path, std::minus<>(), sub, SymInt) +DEFINE_BINARY(operator_mul_slow_path, std::multiplies<>(), mul, SymInt) +DEFINE_BINARY(operator_div_slow_path, std::divides<>(), floordiv, SymInt) +DEFINE_BINARY(operator_mod_slow_path, std::modulus<>(), mod, SymInt) +DEFINE_BINARY(sym_eq_slow_path, std::equal_to<>(), eq, SymBool) +DEFINE_BINARY(sym_ne_slow_path, std::not_equal_to<>(), ne, SymBool) +DEFINE_BINARY(sym_lt_slow_path, std::less<>(), lt, SymBool) +DEFINE_BINARY(sym_le_slow_path, std::less_equal<>(), le, SymBool) +DEFINE_BINARY(sym_gt_slow_path, std::greater<>(), gt, SymBool) +DEFINE_BINARY(sym_ge_slow_path, std::greater_equal<>(), ge, SymBool) +DEFINE_BINARY(min_slow_path, std::min, sym_min, SymInt) +DEFINE_BINARY(max_slow_path, std::max, sym_max, SymInt) SymInt::operator SymFloat() const { if (auto ma = maybe_as_int()) { @@ -153,15 +160,15 @@ SymInt operator-(const SymInt& s) { } } -void SymInt::operator*=(const SymInt& sci) { +void SymInt::operator_imul_slow_path(const SymInt& sci) { *this = *this * sci; } -void SymInt::operator/=(const SymInt& sci) { +void SymInt::operator_idiv_slow_path(const SymInt& sci) { *this = *this / sci; } -void SymInt::operator+=(const SymInt& sci) { +void SymInt::operator_iadd_slow_path(const SymInt& sci) { *this = *this + sci; } diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h index 51686f8b81afb..9b1c776cbe2ab 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -177,23 +178,136 
@@ class C10_API SymInt { #endif } - SymInt operator+(const SymInt& sci) const; - SymInt operator-(const SymInt& sci) const; - SymInt operator*(const SymInt& sci) const; - SymInt operator/(const SymInt& sci) const; - SymInt operator%(const SymInt& sci) const; - void operator*=(const SymInt& sci); - void operator+=(const SymInt& sci); - void operator/=(const SymInt& sci); + SymInt operator+(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma + *mb); + } + } + return operator_add_slow_path(sci); + } + + SymInt operator-(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma - *mb); + } + } + return operator_sub_slow_path(sci); + } + + SymInt operator*(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma * *mb); + } + } + return operator_mul_slow_path(sci); + } + + SymInt operator/(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma / *mb); + } + } + return operator_div_slow_path(sci); + } + + SymInt operator%(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma % *mb); + } + } + return operator_mod_slow_path(sci); + } + + void operator*=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma * *mb); + return; + } + } + operator_imul_slow_path(sci); + } + + void operator+=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma + *mb); + return; + } + } + operator_iadd_slow_path(sci); + } + + void operator/=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma / *mb); + return; + } + } + operator_idiv_slow_path(sci); + } SymInt clone() const; - SymBool sym_eq(const SymInt&) const; - SymBool sym_ne(const SymInt&) const; - SymBool sym_lt(const SymInt&) const; - SymBool sym_le(const SymInt&) const; - SymBool sym_gt(const SymInt&) const; - SymBool sym_ge(const SymInt&) const; + SymBool sym_eq(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma == *mb); + } + } + return sym_eq_slow_path(sci); + } + + SymBool sym_ne(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma != *mb); + } + } + return sym_ne_slow_path(sci); + } + + SymBool sym_lt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma < *mb); + } + } + return sym_lt_slow_path(sci); + } + + SymBool sym_le(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma <= *mb); + } + } + return sym_le_slow_path(sci); + } + + SymBool sym_gt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma > *mb); + } + } + return sym_gt_slow_path(sci); + } + + SymBool sym_ge(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma >= *mb); + } + } + return sym_ge_slow_path(sci); + } bool operator==(const SymInt& o) const { return sym_eq(o).guard_bool(__FILE__, __LINE__); @@ -214,8 +328,23 @@ class C10_API SymInt { return sym_ge(o).guard_bool(__FILE__, __LINE__); } - SymInt 
min(const SymInt& sci) const; - SymInt max(const SymInt& sci) const; + SymInt min(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::min(*ma, *mb)); + } + } + return min_slow_path(sci); + } + + SymInt max(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::max(*ma, *mb)); + } + } + return max_slow_path(sci); + } // If both are symbolic, this checks if // they share the same node. @@ -239,11 +368,7 @@ class C10_API SymInt { if (!is_heap_allocated()) { return data_; } - auto* node = toSymNodeImplUnowned(); - if (auto c = node->constant_int()) { - return c; - } - return node->maybe_as_int(); + return maybe_as_int_slow_path(); } // Return whether the integer is directly coercible to a SymInt @@ -264,6 +389,25 @@ class C10_API SymInt { private: void promote_to_negative(); + SymInt operator_add_slow_path(const SymInt& sci) const; + SymInt operator_sub_slow_path(const SymInt& sci) const; + SymInt operator_mul_slow_path(const SymInt& sci) const; + SymInt operator_div_slow_path(const SymInt& sci) const; + SymInt operator_mod_slow_path(const SymInt& sci) const; + void operator_imul_slow_path(const SymInt& sci); + void operator_iadd_slow_path(const SymInt& sci); + void operator_idiv_slow_path(const SymInt& sci); + SymBool sym_eq_slow_path(const SymInt& sci) const; + SymBool sym_ne_slow_path(const SymInt& sci) const; + SymBool sym_lt_slow_path(const SymInt& sci) const; + SymBool sym_le_slow_path(const SymInt& sci) const; + SymBool sym_gt_slow_path(const SymInt& sci) const; + SymBool sym_ge_slow_path(const SymInt& sci) const; + + SymInt min_slow_path(const SymInt& sci) const; + SymInt max_slow_path(const SymInt& sci) const; + + std::optional maybe_as_int_slow_path() const; // Constraints on the internal representation: // diff --git a/c10/core/SymbolicShapeMeta.cpp b/c10/core/SymbolicShapeMeta.cpp index 6fa2ab0ed4f1d..01276d416fbb8 100644 --- a/c10/core/SymbolicShapeMeta.cpp +++ b/c10/core/SymbolicShapeMeta.cpp @@ -71,6 +71,27 @@ normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) { return std::tuple, std::vector>( std::move(base), std::move(size_nodes), std::move(stride_nodes)); } +namespace { +bool all_hinted( + const c10::SymIntArrayRef& sizes, + const c10::SymIntArrayRef& strides) { + auto all_hinted = true; + for (const auto& s : sizes) { + if (!s.has_hint()) { + return false; + } + } + + if (all_hinted) { + for (const auto& s : strides) { + if (!s.has_hint()) { + return false; + } + } + } + return all_hinted; +} +} // namespace // Special treatment because of numel SymBool SymbolicShapeMeta::compute_contiguous() const { @@ -88,28 +109,61 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { return maybe_as_bool.value(); } - auto all_hinted = true; - for (const auto& s : sizes) { - if (!s.has_hint()) { - all_hinted = false; - break; - } + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. 
+ return _compute_contiguous(sizes_, strides_, numel()); } - if (all_hinted) { - for (const auto& s : strides) { - if (!s.has_hint()) { - all_hinted = false; - break; - } - } + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_2d() const { + if (!strides_valid_) { + return false; } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); - if (all_hinted) { + auto result = _compute_channels_last_contiguous_2d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { // We avoid going through the slow path if everything is hinted, // because evaluating a large SymPy expression can be expensive. // TODO exclude backed_size_oblivious from this path. - return _compute_contiguous(sizes_, strides_, numel()); + return _compute_channels_last_contiguous_2d(sizes_, strides_); + } + + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_3d() const { + if (!strides_valid_) { + return false; + } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); + + auto result = _compute_channels_last_contiguous_3d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. + return _compute_channels_last_contiguous_3d(sizes_, strides_); } return result; @@ -143,8 +197,6 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { } // clang-format off -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_2d, _compute_channels_last_contiguous_2d) -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_3d, _compute_channels_last_contiguous_3d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_2d, is_channels_last_strides_2d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_3d, is_channels_last_strides_3d) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index f3ec2f2d46ea2..cd0321d3bb6f5 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -313,8 +313,15 @@ void TensorImpl::throw_data_ptr_access_error() const { c10::SymBool TensorImpl::sym_is_contiguous_custom( at::MemoryFormat memory_format) const { if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) { - return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( - this, memory_format); + // TO reduce BC breaking and reduce having to introduce + // sym_is_contiguous. 
call is_contiguous when tensor does not + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous( + this, memory_format); + } else { + return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( + this, memory_format); + } } return sym_is_contiguous_default(memory_format); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 381bc65b27fbd..972181327b1f6 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -643,47 +643,43 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } } - // From https://stackoverflow.com/a/3057522/23845 - // TODO: does C++14 have a stdlib template for this? - template - struct identity { - typedef T type; - }; - template ArrayRef generic_sizes() { - return _generic_sizes(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - ArrayRef _generic_sizes(identity) { - return sizes(); - } - ArrayRef _generic_sizes(identity) { - return sym_sizes(); + if constexpr (std::is_same_v) { + return sizes(); + } else { + return sym_sizes(); + } } template ArrayRef generic_strides() { - return _generic_strides(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - ArrayRef _generic_strides(identity) { - return strides(); - } - ArrayRef _generic_strides(identity) { - return sym_strides(); + if constexpr (std::is_same_v) { + return strides(); + } else { + return sym_strides(); + } } template T generic_storage_offset() { - return _generic_storage_offset(identity()); - } + static_assert( + std::is_same_v || std::is_same_v, + "Only supports int64_t and c10::SymInt."); - int64_t _generic_storage_offset(identity) { - return storage_offset(); - } - c10::SymInt _generic_storage_offset(identity) { - return sym_storage_offset(); + if constexpr (std::is_same_v) { + return storage_offset(); + } else { + return sym_storage_offset(); + } } /** @@ -2090,6 +2086,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { constexpr auto sparse_backends = DispatchKeySet( {BackendComponent::CPUBit, BackendComponent::CUDABit, + BackendComponent::MPSBit, BackendComponent::HIPBit, BackendComponent::XPUBit}); constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index b4ae1d612e961..913bc78726576 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -60,6 +60,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_contiguous); } + c10::SymBool sym_is_contiguous(const TensorImpl* self, at::MemoryFormat) + const override { + PANIC(sym_is_contiguous); + } bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_strides_like); diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h index 09d4801f7d83d..def708c24b802 100644 --- a/c10/core/impl/PyInterpreter.h +++ b/c10/core/impl/PyInterpreter.h @@ -168,6 +168,9 @@ struct C10_API PyInterpreterVTable { virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const = 0; + virtual c10::SymBool sym_is_contiguous( + const TensorImpl* self, + at::MemoryFormat) const = 0; virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const = 0; virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0; diff --git 
a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp index 3ad84fd345ca5..8706f7362a3d2 100644 --- a/c10/cuda/CUDAAllocatorConfig.cpp +++ b/c10/cuda/CUDAAllocatorConfig.cpp @@ -1,119 +1,393 @@ #include +#include +#include #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) #include #endif -#include - namespace c10::cuda::CUDACachingAllocator { -size_t CUDAAllocatorConfig::parseAllocatorConfig( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, +constexpr size_t kRoundUpPowerOfTwoIntervals = 16; + +CUDAAllocatorConfig::CUDAAllocatorConfig() + : m_max_split_size(std::numeric_limits::max()), + m_max_non_split_rounding_size(kLargeBuffer), + m_garbage_collection_threshold(0), + m_pinned_num_register_threads(1), + m_expandable_segments(false), +#if CUDA_VERSION >= 12030 + m_expandable_segments_handle_type( + Expandable_Segments_Handle_Type::UNSPECIFIED), +#else + m_expandable_segments_handle_type( + Expandable_Segments_Handle_Type::POSIX_FD), +#endif + m_release_lock_on_cudamalloc(false), + m_pinned_use_cuda_host_register(false), + m_graph_capture_record_stream_reuse(false), + m_pinned_use_background_threads(false) { + m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); +} + +size_t CUDAAllocatorConfig::roundup_power2_divisions(size_t size) { + size_t log_size = (63 - llvm::countLeadingZeros(size)); + + // Our intervals start at 1MB and end at 64GB + const size_t interval_start = + 63 - llvm::countLeadingZeros(static_cast(1048576)); + const size_t interval_end = + 63 - llvm::countLeadingZeros(static_cast(68719476736)); + TORCH_CHECK( + (interval_end - interval_start == kRoundUpPowerOfTwoIntervals), + "kRoundUpPowerOfTwoIntervals mismatch"); + + int index = static_cast(log_size) - static_cast(interval_start); + + index = std::max(0, index); + index = std::min(index, static_cast(kRoundUpPowerOfTwoIntervals) - 1); + return instance().m_roundup_power2_divisions[index]; +} + +void CUDAAllocatorConfig::lexArgs( + const std::string& env, + std::vector& config) { + std::vector buf; + + for (char ch : env) { + if (ch == ',' || ch == ':' || ch == '[' || ch == ']') { + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + buf.clear(); + } + config.emplace_back(1, ch); + } else if (ch != ' ') { + buf.emplace_back(ch); + } + } + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + } +} + +void CUDAAllocatorConfig::consumeToken( + const std::vector& config, + size_t i, + const char c) { + TORCH_CHECK( + i < config.size() && config[i] == std::string(1, c), + "Error parsing CachingAllocator settings, expected ", + c, + ""); +} + +size_t CUDAAllocatorConfig::parseMaxSplitSize( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + constexpr int mb = 1024 * 1024; + if (++i < config.size()) { + size_t val1 = stoi(config[i]); + TORCH_CHECK( + val1 > kLargeBuffer / mb, + "CachingAllocator option max_split_size_mb too small, must be > ", + kLargeBuffer / mb, + ""); + val1 = std::max(val1, kLargeBuffer / mb); + val1 = std::min(val1, (std::numeric_limits::max() / mb)); + m_max_split_size = val1 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Error, expecting max_split_size_mb value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize( + const std::vector& config, size_t i) { + consumeToken(config, ++i, ':'); + constexpr int mb = 1024 * 1024; + if (++i < config.size()) { + size_t val1 = stoi(config[i]); + TORCH_CHECK( + val1 > kLargeBuffer / mb, + "CachingAllocator option 
max_non_split_rounding_mb too small, must be > ", + kLargeBuffer / mb, + ""); + val1 = std::max(val1, kLargeBuffer / mb); + val1 = std::min(val1, (std::numeric_limits::max() / mb)); + m_max_non_split_rounding_size = val1 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + double val1 = stod(config[i]); + TORCH_CHECK( + val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", ""); + TORCH_CHECK( + val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", ""); + m_garbage_collection_threshold = val1; + } else { + TORCH_CHECK( + false, "Error, expecting garbage_collection_threshold value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseRoundUpPower2Divisions( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + bool first_value = true; + + if (++i < config.size()) { + if (std::string_view(config[i]) == "[") { + size_t last_index = 0; + // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions) + while (++i < config.size() && std::string_view(config[i]) != "]") { + const std::string& val1 = config[i]; + size_t val2 = 0; + + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + val2 = stoi(config[i]); + } else { + TORCH_CHECK( + false, "Error parsing roundup_power2_divisions value", ""); + } + TORCH_CHECK( + val2 == 0 || llvm::isPowerOf2_64(val2), + "For roundups, the divisions has to be power of 2 or 0 to disable roundup ", + ""); + + if (std::string_view(val1) == ">") { + std::fill( + std::next( + m_roundup_power2_divisions.begin(), + static_cast::difference_type>( + last_index)), + m_roundup_power2_divisions.end(), + val2); + } else { + size_t val1_long = stoul(val1); + TORCH_CHECK( + llvm::isPowerOf2_64(val1_long), + "For roundups, the intervals have to be power of 2 ", + ""); + + size_t index = 63 - llvm::countLeadingZeros(val1_long); + index = std::max((size_t)0, index); + index = std::min(index, m_roundup_power2_divisions.size() - 1); + + if (first_value) { + std::fill( + m_roundup_power2_divisions.begin(), + std::next( + m_roundup_power2_divisions.begin(), + static_cast::difference_type>( + index)), + val2); + first_value = false; + } + if (index < m_roundup_power2_divisions.size()) { + m_roundup_power2_divisions[index] = val2; + } + last_index = index; + } + + if (std::string_view(config[i + 1]) != "]") { + consumeToken(config, ++i, ','); + } + } + } else { // Keep this for backwards compatibility + size_t val1 = stoi(config[i]); + TORCH_CHECK( + llvm::isPowerOf2_64(val1), + "For roundups, the divisions has to be power of 2 ", + ""); + std::fill( + m_roundup_power2_divisions.begin(), + m_roundup_power2_divisions.end(), + val1); + } + } else { + TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseAllocatorConfig( + const std::vector& config, + size_t i, + bool& used_cudaMallocAsync) { // For ease of maintenance and understanding, the CUDA and ROCm // implementations of this function are separated. This avoids having many // #ifdef's throughout. +#ifdef USE_ROCM // Ease burden on ROCm users by allowing either cuda or hip tokens. // cuda token is broken up to prevent hipify matching it. 
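Stepping back to the roundup_power2_divisions() lookup restored above: sizes are bucketed by their log2 into 16 intervals spanning 1 MiB to 64 GiB. The sketch below reproduces just that index arithmetic in isolation, using the GCC/Clang __builtin_clzll in place of llvm::countLeadingZeros; it illustrates the bucketing only and is not the c10 implementation.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Sizes are mapped to one of 16 log2 buckets covering 1 MiB .. 64 GiB,
// clamped at both ends (mirrors roundup_power2_divisions(size) above).
static size_t bucket_index(size_t size) {
  constexpr size_t kIntervals = 16;
  const size_t log_size = 63 - __builtin_clzll(size);
  const size_t start = 63 - __builtin_clzll(1048576ull);  // log2(1 MiB) = 20
  long idx = static_cast<long>(log_size) - static_cast<long>(start);
  idx = std::max(0l, idx);
  idx = std::min(idx, static_cast<long>(kIntervals) - 1);
  return static_cast<size_t>(idx);
}

int main() {
  std::printf("512 KiB -> bucket %zu\n", bucket_index(512 * 1024));       // clamped to 0
  std::printf("2 MiB   -> bucket %zu\n", bucket_index(2 * 1024 * 1024));  // 1
  std::printf("1 TiB   -> bucket %zu\n", bucket_index(1ull << 40));       // clamped to 15
}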
#define PYTORCH_TOKEN1 \ "cud" \ "aMallocAsync" #define PYTORCH_TOKEN2 "hipMallocAsync" - tokenizer.checkToken(++i, ":"); - i++; // Move to the value after the colon - TORCH_CHECK_VALUE( - ((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) || - (tokenizer[i] == PYTORCH_TOKEN2)), - "Unknown allocator backend, " - "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2); - if (m_is_allocator_loaded) { - bool aync_allocator_at_runtime = (tokenizer[i] != "native"); - TORCH_CHECK( - aync_allocator_at_runtime == m_use_async_allocator, - "Allocator async backend parsed at runtime != allocator async backend parsed at load time, ", - aync_allocator_at_runtime, + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + ((config[i] == "native") || (config[i] == PYTORCH_TOKEN1) || + (config[i] == PYTORCH_TOKEN2)), + "Unknown allocator backend, " + "options are native, " PYTORCH_TOKEN1 ", and " PYTORCH_TOKEN2); + used_cudaMallocAsync = + (config[i] == PYTORCH_TOKEN1 || config[i] == PYTORCH_TOKEN2); + TORCH_INTERNAL_ASSERT( + config[i] == get()->name() || + (config[i] == PYTORCH_TOKEN1 && get()->name() == PYTORCH_TOKEN2), + "Allocator backend parsed at runtime != " + "allocator backend parsed at load time, ", + config[i], " != ", - m_use_async_allocator); + get()->name()); + } else { + TORCH_CHECK(false, "Error parsing backend value", ""); } - m_use_async_allocator = - (tokenizer[i] == PYTORCH_TOKEN1 || tokenizer[i] == PYTORCH_TOKEN2); - // CUDA allocator is always loaded at the start of the program - m_is_allocator_loaded = true; - -#if defined(CUDA_VERSION) - if (m_use_async_allocator) { -#if CUDA_VERSION >= 11040 - int version = 0; - C10_CUDA_CHECK(cudaDriverGetVersion(&version)); + return i; +#undef PYTORCH_TOKEN1 +#undef PYTORCH_TOKEN2 +#else // USE_ROCM + consumeToken(config, ++i, ':'); + if (++i < config.size()) { TORCH_CHECK( - version >= 11040, - "backend:cudaMallocAsync requires CUDA runtime " - "11.4 or newer, but cudaDriverGetVersion returned ", - version); + ((config[i] == "native") || (config[i] == "cudaMallocAsync")), + "Unknown allocator backend, " + "options are native and cudaMallocAsync"); + used_cudaMallocAsync = (config[i] == "cudaMallocAsync"); + if (used_cudaMallocAsync) { +#if CUDA_VERSION >= 11040 + int version = 0; + C10_CUDA_CHECK(cudaDriverGetVersion(&version)); + TORCH_CHECK( + version >= 11040, + "backend:cudaMallocAsync requires CUDA runtime " + "11.4 or newer, but cudaDriverGetVersion returned ", + version); #else - TORCH_CHECK( - false, - "backend:cudaMallocAsync requires PyTorch to be built with " - "CUDA 11.4 or newer, but CUDA_VERSION is ", - CUDA_VERSION); + TORCH_CHECK( + false, + "backend:cudaMallocAsync requires PyTorch to be built with " + "CUDA 11.4 or newer, but CUDA_VERSION is ", + CUDA_VERSION); #endif + } + TORCH_INTERNAL_ASSERT( + config[i] == get()->name(), + "Allocator backend parsed at runtime != " + "allocator backend parsed at load time"); + } else { + TORCH_CHECK(false, "Error parsing backend value", ""); } -#endif - return i; -#undef PYTORCH_TOKEN1 -#undef PYTORCH_TOKEN2 +#endif // USE_ROCM } -void CUDAAllocatorConfig::parseArgs(const std::string& env) { +void CUDAAllocatorConfig::parseArgs(const std::optional& env) { // If empty, set the default values + m_max_split_size = std::numeric_limits::max(); + m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); + m_garbage_collection_threshold = 0; + bool used_cudaMallocAsync = false; bool used_native_specific_option = false; - 
c10::CachingAllocator::ConfigTokenizer tokenizer(env); - for (size_t i = 0; i < tokenizer.size(); i++) { - const auto& key = tokenizer[i]; - if (key == "backend") { - i = parseAllocatorConfig(tokenizer, i); + if (!env.has_value()) { + return; + } + { + std::lock_guard lock(m_last_allocator_settings_mutex); + m_last_allocator_settings = env.value(); + } + + std::vector config; + lexArgs(env.value(), config); + + for (size_t i = 0; i < config.size(); i++) { + std::string_view config_item_view(config[i]); + if (config_item_view == "max_split_size_mb") { + i = parseMaxSplitSize(config, i); + used_native_specific_option = true; + } else if (config_item_view == "max_non_split_rounding_mb") { + i = parseMaxNonSplitRoundingSize(config, i); + used_native_specific_option = true; + } else if (config_item_view == "garbage_collection_threshold") { + i = parseGarbageCollectionThreshold(config, i); + used_native_specific_option = true; + } else if (config_item_view == "roundup_power2_divisions") { + i = parseRoundUpPower2Divisions(config, i); + used_native_specific_option = true; + } else if (config_item_view == "backend") { + i = parseAllocatorConfig(config, i, used_cudaMallocAsync); + } else if (config_item_view == "expandable_segments") { + used_native_specific_option = true; + consumeToken(config, ++i, ':'); + ++i; + TORCH_CHECK( + i < config.size() && + (std::string_view(config[i]) == "True" || + std::string_view(config[i]) == "False"), + "Expected a single True/False argument for expandable_segments"); + config_item_view = config[i]; + m_expandable_segments = (config_item_view == "True"); } else if ( // ROCm build's hipify step will change "cuda" to "hip", but for ease of // use, accept both. We must break up the string to prevent hipify here. - key == "release_lock_on_hipmalloc" || - key == + config_item_view == "release_lock_on_hipmalloc" || + config_item_view == "release_lock_on_c" "udamalloc") { used_native_specific_option = true; - tokenizer.checkToken(++i, ":"); - m_release_lock_on_cudamalloc = tokenizer.toBool(++i); + consumeToken(config, ++i, ':'); + ++i; + TORCH_CHECK( + i < config.size() && + (std::string_view(config[i]) == "True" || + std::string_view(config[i]) == "False"), + "Expected a single True/False argument for release_lock_on_cudamalloc"); + config_item_view = config[i]; + m_release_lock_on_cudamalloc = (config_item_view == "True"); } else if ( // ROCm build's hipify step will change "cuda" to "hip", but for ease of // use, accept both. We must break up the string to prevent hipify here. 
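For reference, the hand-rolled lexer that parseArgs() goes back to here turns the raw PYTORCH_CUDA_ALLOC_CONF string into a flat token stream: ',', ':', '[' and ']' become one-character tokens, spaces are dropped, and everything else accumulates into value tokens. A standalone sketch of that behaviour (the config string in main is only an example):

#include <iostream>
#include <string>
#include <vector>

// lexArgs()-style tokenizer: delimiters become single-character tokens,
// spaces are skipped, other characters build up value tokens.
std::vector<std::string> lex(const std::string& env) {
  std::vector<std::string> out;
  std::string buf;
  for (char ch : env) {
    if (ch == ',' || ch == ':' || ch == '[' || ch == ']') {
      if (!buf.empty()) {
        out.push_back(buf);
        buf.clear();
      }
      out.push_back(std::string(1, ch));
    } else if (ch != ' ') {
      buf.push_back(ch);
    }
  }
  if (!buf.empty()) {
    out.push_back(buf);
  }
  return out;
}

int main() {
  for (const auto& tok : lex("max_split_size_mb:256,expandable_segments:True")) {
    std::cout << '[' << tok << "] ";
  }
  std::cout << '\n';  // [max_split_size_mb] [:] [256] [,] [expandable_segments] [:] [True]
}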
- key == "pinned_use_hip_host_register" || - key == + config_item_view == "pinned_use_hip_host_register" || + config_item_view == "pinned_use_c" "uda_host_register") { - i = parsePinnedUseCudaHostRegister(tokenizer, i); + i = parsePinnedUseCudaHostRegister(config, i); used_native_specific_option = true; - } else if (key == "pinned_num_register_threads") { - i = parsePinnedNumRegisterThreads(tokenizer, i); + } else if (config_item_view == "pinned_num_register_threads") { + i = parsePinnedNumRegisterThreads(config, i); + used_native_specific_option = true; + } else if (config_item_view == "pinned_use_background_threads") { + i = parsePinnedUseBackgroundThreads(config, i); + used_native_specific_option = true; + } else if (config_item_view == "graph_capture_record_stream_reuse") { + i = parseGraphCaptureRecordStreamReuse(config, i); used_native_specific_option = true; } else { - const auto& keys = - c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys(); TORCH_CHECK( - keys.find(key) != keys.end(), - "Unrecognized key '", - key, - "' in Accelerator allocator config."); - i = tokenizer.skipKey(i); + false, "Unrecognized CachingAllocator option: ", config_item_view); } - if (i + 1 < tokenizer.size()) { - tokenizer.checkToken(++i, ","); + if (i + 1 < config.size()) { + consumeToken(config, ++i, ','); } } - if (m_use_async_allocator && used_native_specific_option) { + if (used_cudaMallocAsync && used_native_specific_option) { TORCH_WARN( "backend:cudaMallocAsync ignores max_split_size_mb," "roundup_power2_divisions, and garbage_collect_threshold."); @@ -121,33 +395,81 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) { } size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, size_t i) { - tokenizer.checkToken(++i, ":"); - m_pinned_use_cuda_host_register = tokenizer.toBool(++i); + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for pinned_use_cuda_host_register"); + m_pinned_use_cuda_host_register = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting pinned_use_cuda_host_register value", ""); + } + return i; +} + +size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for graph_capture_record_stream_reuse"); + m_graph_capture_record_stream_reuse = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting graph_capture_record_stream_reuse value", ""); + } return i; } size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, size_t i) { - tokenizer.checkToken(++i, ":"); - size_t val2 = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( - llvm::isPowerOf2_64(val2), - "Number of register threads has to be power of 2 ", - ""); - auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads(); - TORCH_CHECK_VALUE( - val2 <= maxThreads, - "Number of register threads should be less than or equal to " + - std::to_string(maxThreads), - ""); - m_pinned_num_register_threads = val2; + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + size_t val2 = stoi(config[i]); + TORCH_CHECK( + llvm::isPowerOf2_64(val2), + "Number of register 
threads has to be power of 2 ", + ""); + auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads(); + TORCH_CHECK( + val2 <= maxThreads, + "Number of register threads should be less than or equal to " + + std::to_string(maxThreads), + ""); + m_pinned_num_register_threads = val2; + } else { + TORCH_CHECK( + false, "Error, expecting pinned_num_register_threads value", ""); + } return i; } -REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig) +size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for pinned_use_background_threads"); + m_pinned_use_background_threads = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting pinned_use_background_threads value", ""); + } + return i; +} + +// General caching allocator utilities +void setAllocatorSettings(const std::string& env) { + CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str()); +} } // namespace c10::cuda::CUDACachingAllocator diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index 21d72e4b68313..54c41ba70fb6f 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -1,12 +1,16 @@ #pragma once -#include -#include #include -#include #include #include +#include +#include +#include +#include +#include +#include + namespace c10::cuda::CUDACachingAllocator { enum class Expandable_Segments_Handle_Type : int { @@ -18,28 +22,21 @@ enum class Expandable_Segments_Handle_Type : int { // Environment config parser class C10_CUDA_API CUDAAllocatorConfig { public: - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.") static size_t max_split_size() { - return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size(); + return instance().m_max_split_size; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.") static double garbage_collection_threshold() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - garbage_collection_threshold(); + return instance().m_garbage_collection_threshold; } static bool expandable_segments() { - bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig:: - use_expandable_segments(); #ifndef PYTORCH_C10_DRIVER_API_SUPPORTED - if (enabled) { + if (instance().m_expandable_segments) { TORCH_WARN_ONCE("expandable_segments not supported on this platform") } return false; #else - return enabled; + return instance().m_expandable_segments; #endif } @@ -56,6 +53,10 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_release_lock_on_cudamalloc; } + static bool graph_capture_record_stream_reuse() { + return instance().m_graph_capture_record_stream_reuse; + } + /** Pinned memory allocator settings */ static bool pinned_use_cuda_host_register() { return instance().m_pinned_use_cuda_host_register; @@ -65,11 +66,8 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_pinned_num_register_threads; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. 
Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.") static bool pinned_use_background_threads() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - pinned_use_background_threads(); + return instance().m_pinned_use_background_threads; } static size_t pinned_max_register_threads() { @@ -79,107 +77,96 @@ class C10_CUDA_API CUDAAllocatorConfig { return 128; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") - static size_t roundup_power2_divisions(size_t size) { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - roundup_power2_divisions(size); - } + // This is used to round-up allocation size to nearest power of 2 divisions. + // More description below in function roundup_power2_next_division + // As an example, if we want 4 divisions between 2's power, this can be done + // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4 + static size_t roundup_power2_divisions(size_t size); - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.") static std::vector roundup_power2_divisions() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - roundup_power2_divisions(); + return instance().m_roundup_power2_divisions; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_non_split_rounding_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_non_split_rounding_size() instead.") static size_t max_non_split_rounding_size() { - return c10::CachingAllocator::AcceleratorAllocatorConfig:: - max_non_split_rounding_size(); + return instance().m_max_non_split_rounding_size; } - C10_DEPRECATED_MESSAGE( - "c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.") static std::string last_allocator_settings() { - return c10::CachingAllocator::getAllocatorSettings(); - } - - static bool use_async_allocator() { - return instance().m_use_async_allocator; - } - - // Use `Construct On First Use Idiom` to avoid `Static Initialization Order` - // issue. - static const std::unordered_set& getKeys() { - static std::unordered_set keys{ - "backend", - // keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues - // NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors) - "release_lock_on_cud" - "amalloc", - "pinned_use_cud" - "a_host_register", - // NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors) - "release_lock_on_hipmalloc", - "pinned_use_hip_host_register", - "pinned_num_register_threads"}; - return keys; + std::lock_guard lock( + instance().m_last_allocator_settings_mutex); + return instance().m_last_allocator_settings; } static CUDAAllocatorConfig& instance() { static CUDAAllocatorConfig* s_instance = ([]() { auto inst = new CUDAAllocatorConfig(); - auto env = c10::utils::get_env("PYTORCH_ALLOC_CONF"); - if (!env.has_value()) { - // For backward compatibility, check for the old environment variable - // PYTORCH_CUDA_ALLOC_CONF. 
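Usage-wise, everything this header exposes is read from PYTORCH_CUDA_ALLOC_CONF the first time CUDAAllocatorConfig::instance() is constructed, so the variable must be set before anything touches the CUDA caching allocator. A small example, assuming a POSIX environment for setenv; the option names come from the parser above and the values are only illustrative:

#include <stdlib.h>  // setenv (POSIX)

int main() {
  // Must happen before the first allocator use, otherwise the singleton has
  // already parsed (or ignored) the environment.
  setenv("PYTORCH_CUDA_ALLOC_CONF",
         "max_split_size_mb:256,"
         "roundup_power2_divisions:[256:1,512:2,1024:4,>:8],"
         "garbage_collection_threshold:0.8,"
         "pinned_use_background_threads:True",
         /*overwrite=*/1);
  // ... initialize CUDA / run the workload here ...
  return 0;
}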
- env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); - } + auto env = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); #ifdef USE_ROCM // convenience for ROCm users, allow alternative HIP token if (!env.has_value()) { env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF"); } #endif - if (env.has_value()) { - inst->parseArgs(env.value()); - } + inst->parseArgs(env); return inst; })(); return *s_instance; } - void parseArgs(const std::string& env); + void parseArgs(const std::optional& env); private: - CUDAAllocatorConfig() = default; - - size_t parseAllocatorConfig( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + CUDAAllocatorConfig(); + + static void lexArgs(const std::string& env, std::vector& config); + static void consumeToken( + const std::vector& config, + size_t i, + const char c); + size_t parseMaxSplitSize(const std::vector& config, size_t i); + size_t parseMaxNonSplitRoundingSize( + const std::vector& config, + size_t i); + size_t parseGarbageCollectionThreshold( + const std::vector& config, + size_t i); + size_t parseRoundUpPower2Divisions( + const std::vector& config, size_t i); + size_t parseAllocatorConfig( + const std::vector& config, + size_t i, + bool& used_cudaMallocAsync); size_t parsePinnedUseCudaHostRegister( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, size_t i); size_t parsePinnedNumRegisterThreads( - const c10::CachingAllocator::ConfigTokenizer& tokenizer, + const std::vector& config, + size_t i); + size_t parsePinnedUseBackgroundThreads( + const std::vector& config, + size_t i); + size_t parseGraphCaptureRecordStreamReuse( + const std::vector& config, size_t i); - std::atomic m_pinned_num_register_threads{1}; - std::atomic m_expandable_segments_handle_type -#if CUDA_VERSION >= 12030 - {Expandable_Segments_Handle_Type::UNSPECIFIED}; -#else - {Expandable_Segments_Handle_Type::POSIX_FD}; -#endif - std::atomic m_release_lock_on_cudamalloc{false}; - std::atomic m_pinned_use_cuda_host_register{false}; - std::atomic m_use_async_allocator{false}; - std::atomic m_is_allocator_loaded{false}; + std::atomic m_max_split_size; + std::atomic m_max_non_split_rounding_size; + std::vector m_roundup_power2_divisions; + std::atomic m_garbage_collection_threshold; + std::atomic m_pinned_num_register_threads; + std::atomic m_expandable_segments; + std::atomic + m_expandable_segments_handle_type; + std::atomic m_release_lock_on_cudamalloc; + std::atomic m_pinned_use_cuda_host_register; + std::atomic m_graph_capture_record_stream_reuse; + std::atomic m_pinned_use_background_threads; + std::string m_last_allocator_settings; + std::mutex m_last_allocator_settings_mutex; }; -// Keep this for backwards compatibility -using c10::CachingAllocator::setAllocatorSettings; +// General caching allocator utilities +C10_CUDA_API void setAllocatorSettings(const std::string& env); } // namespace c10::cuda::CUDACachingAllocator diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index c2a46ac9f3f74..93ac4f7a4c649 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -63,6 +64,10 @@ namespace cuda::CUDACachingAllocator { using namespace c10::CachingAllocator; using namespace c10::CachingDeviceAllocator; +// Included here as this is externally used in CUDAAllocatorConfig +const size_t kLargeBuffer = + 20971520; // "large" allocations may be packed in 20 MiB blocks + namespace Native { // @@ -1162,8 +1167,13 @@ class 
DeviceCachingAllocator { // tracks which pools we can use as a last resort before ooming ska::flat_hash_set use_on_oom_pools; - // See free() for this thing's purpose - std::vector needs_events_deferred_until_no_capture; + // Map of blocks whose freeing is deferred until after CUDA graph capture. + // - Key: Block* to be freed. + // - Value: List of "empty nodes" inserted as free markers during capture. + // If the vector is empty, the block must always be deferred until capture + // ends. + ska::flat_hash_map> deferred_blocks; + // outstanding cuda events ska::flat_hash_map< cuda::CUDAStream, @@ -1218,7 +1228,7 @@ class DeviceCachingAllocator { DeviceCachingAllocator() : large_blocks(/*small=*/false), small_blocks(/*small=*/true) { stats.max_split_size = - static_cast(AcceleratorAllocatorConfig::max_split_size()); + static_cast(CUDAAllocatorConfig::max_split_size()); context_recorder_.store(nullptr); } @@ -1324,6 +1334,11 @@ class DeviceCachingAllocator { // capture. Cross-stream memory use is uncommon, so the deferral's // effect on memory use during capture should be small. process_events(context); + } else { + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // We check if there is some block that is safe to reuse on this stream + free_safe_blocks_in_capture(context, stream); + } } size_t size = round_size(orig_size); auto& pool = get_pool(size, stream); @@ -1343,8 +1358,7 @@ class DeviceCachingAllocator { // Do garbage collection if the flag is set. if (C10_UNLIKELY( set_fraction && - AcceleratorAllocatorConfig::garbage_collection_threshold() > - 0.0)) { + CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { garbage_collect_cached_blocks(context); } // Attempt allocate @@ -1596,7 +1610,7 @@ class DeviceCachingAllocator { stats.active_bytes[stat_type].increase(block->size); stats.requested_bytes[stat_type].increase(block->requested_size); }); - if (block->size >= AcceleratorAllocatorConfig::max_split_size()) + if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_allocations.increase(1); auto allocated_bytes_gauge = @@ -1615,6 +1629,248 @@ class DeviceCachingAllocator { return block; } + // Insert "free marker" (empty nodes) into the CUDA graph for all streams that + // have used the block, including the allocation stream. These nodes mark the + // last use of the block in the capture graph. Returns a vector of the + // inserted nodes, or an empty vector if any stream is not capturing. 
+ std::vector insert_free_marker(Block* block) { + std::vector empty_nodes; + + auto try_add_empty_node = [&](cudaStream_t stream) -> bool { + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* deps = nullptr; + size_t num_deps = 0; +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, &status, nullptr, &graph, &deps, nullptr, &num_deps)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &deps, &num_deps)); +#endif + + TORCH_INTERNAL_ASSERT( + status != cudaStreamCaptureStatusInvalidated, + "Invalid stream capture status"); + + if (status == cudaStreamCaptureStatusNone) { + return false; + } + + cudaGraphNode_t node{}; + C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps)); +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies)); +#else + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, 1, cudaStreamSetCaptureDependencies)); +#endif + empty_nodes.push_back(node); + return true; + }; + + // If any stream is not currently capturing, return an empty node vector. + // An empty vector indicates that the block should be deferred for freeing + // until after capture. + + // Attempt to add an empty node for the allocation stream. + if (!try_add_empty_node(block->stream)) { + return {}; + } + // Attempt to add empty nodes for all streams that have used the block. + for (const auto& s : block->stream_uses) { + if (!try_add_empty_node(s.stream())) { + return {}; + } + } + return empty_nodes; + } + + // Returns the current set of "terminal" nodes in the CUDA graph for a given + // stream. These represent the current endpoints of the stream, and may + // include additional nodes if the graph branches. Any new work captured will + // be attached after one or more of these terminals. + std::vector get_terminals(cudaStream_t stream) { + std::vector result; + + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* dependencies = nullptr; + size_t num_dependencies = 0; + +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, + &status, + nullptr, + &graph, + &dependencies, + nullptr, + &num_dependencies)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &dependencies, &num_dependencies)); +#endif + + TORCH_INTERNAL_ASSERT( + status == cudaStreamCaptureStatusActive, + "Invalid stream capture status"); + + for (size_t i = 0; i < num_dependencies; i++) { + auto node = dependencies[i]; + if (node != nullptr) { + result.push_back(node); + } + } + + return result; + } + + // Returns the set of "reusable" free markers (empty nodes) in the current + // CUDA graph capture. A free marker is considered reusable if it is a + // predecessor of every terminal node. + // This ensures that all future captured work will occur after the free + // marker, making it safe to reuse. + ska::flat_hash_set get_reusable_empty_nodes( + cudaStream_t stream) { + auto terminals = get_terminals(stream); + if (terminals.empty()) { + // No terminal nodes found; nothing to free. 
+ return {}; + } + + auto get_dependencies = [](cudaGraphNode_t node, + cudaGraphNode_t* pDependencies, + size_t* pNumDependencies) -> void { +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaGraphNodeGetDependencies( + node, pDependencies, nullptr, pNumDependencies)); +#else + C10_CUDA_CHECK( + cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies)); +#endif + }; + + // Helper to retrieve all parent nodes (dependencies) of a given node. + auto get_parents = + [&](cudaGraphNode_t node) -> std::vector { + size_t count = 0; + get_dependencies(node, nullptr, &count); + std::vector out(count); + if (count) { + get_dependencies(node, out.data(), &count); + out.resize(count); + } + return out; + }; + + // Helper to determine if a node is an empty node (used as a free marker). + auto is_empty_node = [](cudaGraphNode_t n) -> bool { + cudaGraphNodeType type{}; + C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type)); + return type == cudaGraphNodeTypeEmpty; + }; + + // For each terminal node, perform a reverse DFS to count, for each empty + // node, how many terminals it can reach (i.e., for how many terminals it is + // a predecessor). An empty node is reusable if it is a predecessor of all + // terminal nodes. + ska::flat_hash_map num_terminals_reachable; + + for (auto terminal : terminals) { + ska::flat_hash_set visited; + ska::flat_hash_set empty_nodes; + + std::function reverse_dfs = + [&](cudaGraphNode_t node) { + if (!visited.insert(node).second) + return; + + if (is_empty_node(node)) { + num_terminals_reachable[node]++; + empty_nodes.insert(node); + } + auto parents = get_parents(node); + for (auto p : parents) { + reverse_dfs(p); + } + }; + + reverse_dfs(terminal); + } + + ska::flat_hash_set reusable_empty_nodes; + for (auto [node, count] : num_terminals_reachable) { + if (count == terminals.size()) { + reusable_empty_nodes.insert(node); + } + } + + return reusable_empty_nodes; + } + + // A block is considered reusable during CUDA graph capture if every free + // marker (empty node) associated with the block is a predecessor of every + // terminal node. + // + // This ensures that any new operation added to the graph will be attached + // after all terminal nodes, which themselves are after all free markers. As a + // result, all future work is guaranteed to occur after the block's last use + // on every stream, so the block's previous lifetime ends before any new + // lifetime begins. This check relies solely on the DAG topology and does not + // require event queries, making it safe to use during capture. + // + // This function iterates over all deferred blocks, determines if their empty + // nodes are reusable according to the above criteria, and frees the block if + // so. + void free_safe_blocks_in_capture( + const std::shared_ptr& context, + cudaStream_t stream) { + auto reusable_empty_nodes = get_reusable_empty_nodes(stream); + + // If there are no reusable empty nodes (e.g., not currently capturing), + // there is nothing to do. + if (reusable_empty_nodes.empty()) { + return; + } + + std::vector blocks_to_erase; + + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { + // Skip this block if it has no empty nodes, as we defer its freeing until + // after graph capture. Also skip if the block was not allocated on the + // current stream; such blocks will be freed when + // free_safe_blocks_in_capture is attempted on that stream. 
+ if (inserted_empty_nodes.empty() || block->stream != stream) { + continue; + } + + bool is_reusable = true; + + for (const auto& node : inserted_empty_nodes) { + if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) { + is_reusable = false; + break; + } + } + + if (is_reusable) { + // Clear stream uses since the graph ensures proper synchronization. + // No need to insert events. + block->stream_uses.clear(); + + free_block(block, context); + blocks_to_erase.push_back(block); + } + } + + // Remove blocks that were freed from the deferred_blocks map. + for (auto* block : blocks_to_erase) { + deferred_blocks.erase(block); + } + } + void free(Block* block) { std::shared_ptr context = maybeGatherContext(RecordContext::ALL); @@ -1647,17 +1903,25 @@ class DeviceCachingAllocator { block->pool->owner_MempoolId(), context ? context : block->context_when_allocated); - if (block->size >= AcceleratorAllocatorConfig::max_split_size()) + if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_allocations.decrease(1); + // If the block has been used on more than one stream, handle accordingly. if (!block->stream_uses.empty()) { if (C10_UNLIKELY(!captures_underway.empty())) { - // It's forbidden to cudaEventQuery an event recorded during CUDA graph - // capture. We conservatively defer recording end-of-life events until - // the next call to process_events() (which won't happen until no - // captures are underway) - needs_events_deferred_until_no_capture.push_back(block); + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // insert_free_marker returns a vector of free markers, + // or an empty vector if any associated stream is not currently + // capturing. The empty vector means that we will defer the free until + // capture is finished. + deferred_blocks.emplace(block, insert_free_marker(block)); + } else { + // If graph_capture_record_stream_reuse is not enabled, always defer + // the free until capture is finished. + deferred_blocks.emplace(block, std::vector{}); + } } else { + // If not in a capture, insert events for the block. insert_events(block); } } else { @@ -2196,8 +2460,7 @@ class DeviceCachingAllocator { if (size < kMinBlockSize) { return kMinBlockSize; } else { - auto divisions = - AcceleratorAllocatorConfig::roundup_power2_divisions(size); + auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size); if (divisions > 1 && size > (kMinBlockSize * divisions)) { return roundup_power2_next_division(size, divisions); } else { @@ -2676,7 +2939,7 @@ class DeviceCachingAllocator { if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) { return remaining >= kMinBlockSize; } else { - return (size < AcceleratorAllocatorConfig::max_split_size()) && + return (size < CUDAAllocatorConfig::max_split_size()) && (remaining > kSmallSize); } } @@ -2696,7 +2959,7 @@ class DeviceCachingAllocator { if (C10_UNLIKELY( set_fraction && - AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) { + CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) { // Track block reuse interval only when garbage collection is enabled. 
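The reuse test added above (get_reusable_empty_nodes() plus free_safe_blocks_in_capture()) boils down to a pure graph property: a free marker may be honoured during capture only if it is a predecessor of every current terminal node, so that anything captured later is ordered after the block's last use. Below is a CUDA-free sketch of that check on a toy dependency DAG; node ids and edges are made up for illustration.

#include <cstdio>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Edges point from a node to its dependencies (parents), matching how the
// capture graph is walked backwards in the allocator code above.
using Graph = std::unordered_map<int, std::vector<int>>;

// True if `marker` is a predecessor of every terminal, i.e. all future work
// appended after the terminals is ordered after the marker.
bool marker_reusable(const Graph& parents,
                     const std::vector<int>& terminals,
                     int marker) {
  for (int t : terminals) {
    std::unordered_set<int> visited;
    std::vector<int> stack{t};
    bool found = false;
    while (!stack.empty()) {
      int n = stack.back();
      stack.pop_back();
      if (!visited.insert(n).second) continue;
      if (n == marker) { found = true; break; }
      auto it = parents.find(n);
      if (it != parents.end())
        for (int p : it->second) stack.push_back(p);
    }
    if (!found) return false;  // some terminal does not depend on the marker
  }
  return true;
}

int main() {
  // Marker 1 feeds both terminals (4 and 5); marker 2 only feeds terminal 5.
  Graph parents = {{3, {1}}, {4, {3}}, {5, {3, 2}}};
  std::vector<int> terminals = {4, 5};
  std::printf("marker 1 reusable: %d\n", marker_reusable(parents, terminals, 1));  // 1
  std::printf("marker 2 reusable: %d\n", marker_reusable(parents, terminals, 2));  // 0
}

In the allocator itself this path only runs when the new graph_capture_record_stream_reuse option is enabled (parsed earlier in this patch, e.g. graph_capture_record_stream_reuse:True in PYTORCH_CUDA_ALLOC_CONF); otherwise cross-stream blocks are still deferred until capture ends, as before.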
++pool.get_free_blocks_call_count; } @@ -2738,13 +3001,13 @@ class DeviceCachingAllocator { } // Do not return an oversized block for a large request - if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) && - ((*it)->size >= AcceleratorAllocatorConfig::max_split_size())) + if ((p.size() < CUDAAllocatorConfig::max_split_size()) && + ((*it)->size >= CUDAAllocatorConfig::max_split_size())) return false; // Allow oversized block size to be rounded up but within a limit - if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) && + if ((p.size() >= CUDAAllocatorConfig::max_split_size()) && ((*it)->size >= - p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size())) + p.size() + CUDAAllocatorConfig::max_non_split_rounding_size())) return false; p.block = *it; pool.blocks.erase(it); @@ -2767,7 +3030,7 @@ class DeviceCachingAllocator { // therefore should be of less overheads. size_t gc_threshold = static_cast( - AcceleratorAllocatorConfig::garbage_collection_threshold() * + CUDAAllocatorConfig::garbage_collection_threshold() * static_cast(allowed_memory_maximum)); // No need to trigger GC yet if (total_allocated_memory <= gc_threshold) { @@ -2915,7 +3178,7 @@ class DeviceCachingAllocator { stats.segment[stat_type].increase(1); stats.reserved_bytes[stat_type].increase(size); }); - if (size >= AcceleratorAllocatorConfig::max_split_size()) + if (size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_segments.increase(1); auto reserved_bytes_gauge = STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes); @@ -2944,7 +3207,7 @@ class DeviceCachingAllocator { bool release_available_cached_blocks( const AllocParams& p, const std::shared_ptr& context) { - if (AcceleratorAllocatorConfig::max_split_size() == + if (CUDAAllocatorConfig::max_split_size() == std::numeric_limits::max()) return false; BlockPool& pool = *p.pool; @@ -2952,8 +3215,8 @@ class DeviceCachingAllocator { // because of std::unique_ptr, block cannot be trivially copied // Use constructor for search key. Block key(p.search_key.device, p.search_key.stream, p.search_key.size); - key.size = (key.size < AcceleratorAllocatorConfig::max_split_size()) - ? AcceleratorAllocatorConfig::max_split_size() + key.size = (key.size < CUDAAllocatorConfig::max_split_size()) + ? CUDAAllocatorConfig::max_split_size() : key.size; auto it = pool.blocks.lower_bound(&key); if (it == pool.blocks.end() || (*it)->stream != p.stream() || @@ -2966,7 +3229,7 @@ class DeviceCachingAllocator { --it; // Back up one item. 
Now on the largest block for the correct // stream while ((totalReleased < key.size) && - ((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) && + ((*it)->size >= CUDAAllocatorConfig::max_split_size()) && ((*it)->stream == p.stream())) { auto cur = it; bool is_first = cur == pool.blocks.begin(); @@ -2974,8 +3237,8 @@ class DeviceCachingAllocator { --it; } if (!(*cur)->expandable_segment_) { - release_block(*cur, context); totalReleased += (*cur)->size; + release_block(*cur, context); } if (is_first) { break; @@ -3091,7 +3354,7 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)] .current); - if (block->size >= AcceleratorAllocatorConfig::max_split_size()) + if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_segments.decrease(1); pool->blocks.erase(block); delete block; @@ -3284,8 +3547,8 @@ class DeviceCachingAllocator { void insert_events_deferred_until_no_capture( const std::shared_ptr& context) { - if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { - for (auto* block : needs_events_deferred_until_no_capture) { + if (C10_UNLIKELY(!deferred_blocks.empty())) { + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); // only streams recorded before cudagraph will be used to insert events // since we know all streams recorded during cudagraph must have @@ -3297,7 +3560,7 @@ class DeviceCachingAllocator { free_block(block, context); } } - needs_events_deferred_until_no_capture.clear(); + deferred_blocks.clear(); } } @@ -3718,8 +3981,8 @@ class NativeCachingAllocator : public CUDAAllocator { auto& md = result.config_metadata; md.garbage_collection_threshold = - AcceleratorAllocatorConfig::garbage_collection_threshold(); - md.max_split_size = AcceleratorAllocatorConfig::max_split_size(); + CUDAAllocatorConfig::garbage_collection_threshold(); + md.max_split_size = CUDAAllocatorConfig::max_split_size(); md.pinned_num_register_threads = CUDAAllocatorConfig::pinned_num_register_threads(); md.expandable_segments = CUDAAllocatorConfig::expandable_segments(); @@ -3727,10 +3990,11 @@ class NativeCachingAllocator : public CUDAAllocator { CUDAAllocatorConfig::release_lock_on_cudamalloc(); md.pinned_use_host_register = CUDAAllocatorConfig::pinned_use_cuda_host_register(); - md.last_allocator_settings = - AcceleratorAllocatorConfig::last_allocator_settings(); + md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings(); + md.graph_capture_record_stream_reuse = + CUDAAllocatorConfig::graph_capture_record_stream_reuse(); md.roundup_power2_divisions = - AcceleratorAllocatorConfig::roundup_power2_divisions(); + CUDAAllocatorConfig::roundup_power2_divisions(); return result; } @@ -4108,17 +4372,67 @@ CUDAAllocator* allocator(); } // namespace CudaMallocAsync struct BackendStaticInitializer { + // Parses env for backend at load time, duplicating some logic from + // CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at + // runtime). Defers verbose exceptions and error checks, including Cuda + // version checks, to CUDAAllocatorConfig's runtime doublecheck. If this + // works, maybe we should move all of CUDAAllocatorConfig here? CUDAAllocator* parseEnvForBackend() { - // If the environment variable is set, we use the CudaMallocAsync allocator. 
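parseEnvForBackend() above deliberately re-tokenizes the raw config string with std::sregex_token_iterator rather than reusing the allocator-config machinery, since it runs during static initialization. A standalone sketch of that split-by-regex idiom follows; the config string is only an example.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Split a string into the substrings *between* regex matches, which is what
// std::sregex_token_iterator yields when given submatch index -1.
static std::vector<std::string> split(const std::string& s,
                                      const std::string& pattern) {
  std::regex re(pattern);
  std::sregex_token_iterator it(s.begin(), s.end(), re, -1), end;
  return {it, end};
}

int main() {
  const std::string conf = "backend:cudaMallocAsync, max_split_size_mb:256";
  for (const auto& option : split(conf, "[\\s,]+")) {  // options on spaces/commas
    auto kv = split(option, "[:]+");                   // key from value
    if (kv.size() >= 2 && kv[0] == "backend") {
      std::cout << "selected backend: " << kv[1] << "\n";  // cudaMallocAsync
    }
  }
}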
- if (CUDAAllocatorConfig::use_async_allocator()) { - return CudaMallocAsync::allocator(); + auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF"); +#ifdef USE_ROCM + // convenience for ROCm users to allow either CUDA or HIP env var + if (!val.has_value()) { + val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF"); + } +#endif + if (val.has_value()) { + const std::string& config = val.value(); + + std::regex exp("[\\s,]+"); + std::sregex_token_iterator it(config.begin(), config.end(), exp, -1); + std::sregex_token_iterator end; + std::vector options(it, end); + + for (auto option : options) { + std::regex exp2("[:]+"); + std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1); + std::sregex_token_iterator end2; + std::vector kv(it2, end2); + if (kv.size() >= 2) { + if (kv[0] == "backend") { +#ifdef USE_ROCM + // convenience for ROCm users to allow either CUDA or HIP env var + if (kv[1] == + "cud" + "aMallocAsync" || + kv[1] == "hipMallocAsync") +#else + if (kv[1] == "cudaMallocAsync") +#endif + return CudaMallocAsync::allocator(); + if (kv[1] == "native") + return &Native::allocator; + } + } + } } return &Native::allocator; } BackendStaticInitializer() { auto r = parseEnvForBackend(); +// Register this HIP allocator as the CUDA allocator to allow it to work +// with both c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) +// APIs. We don't perform this masquerading inside +// HIPAllocatorMasqueradingAsCUDA because it needs to happen during static +// initialization, and doing so there may introduce static initialization +// order (SIOF) issues. +#define HIP_MASQUERADING_AS_CUDA \ + "cud" \ + "a" + at::SetAllocator(c10::Device(HIP_MASQUERADING_AS_CUDA).type(), r, 0); allocator.store(r); +#undef HIP_MASQUERADING_AS_CUDA } }; @@ -4145,11 +4459,8 @@ std::atomic MemPool::uuid_{1}; MemPool::MemPool( CUDACachingAllocator::CUDAAllocator* allocator, bool is_user_created, - bool use_on_oom, - bool symmetric) - : allocator_(allocator), - is_user_created_(is_user_created), - symmetric_(symmetric) { + bool use_on_oom) + : allocator_(allocator), is_user_created_(is_user_created) { if (is_user_created_) { id_ = {0, uid_++}; } else { @@ -4172,10 +4483,6 @@ MempoolId_t MemPool::id() { return id_; } -bool MemPool::is_symmetric() { - return symmetric_; -} - CUDACachingAllocator::CUDAAllocator* MemPool::allocator() { return allocator_; } diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index 956411fe22827..bfc486d69fcff 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include @@ -50,9 +49,10 @@ namespace c10::cuda::CUDACachingAllocator { // Preserved only for BC reasons // NOLINTNEXTLINE(misc-unused-using-decls) -using c10::CachingAllocator::kLargeBuffer; using c10::CachingDeviceAllocator::DeviceStats; +extern const size_t kLargeBuffer; + typedef std::shared_ptr (*CreateContextFn)(); // Struct containing info of an allocation block (i.e. 
a fractional part of a @@ -163,6 +163,7 @@ struct AllocatorConfigInfo { bool expandable_segments; bool release_lock_on_malloc; bool pinned_use_host_register; + bool graph_capture_record_stream_reuse; std::string last_allocator_settings; std::vector roundup_power2_divisions; }; @@ -202,25 +203,24 @@ struct ShareableHandle { std::string handle; }; -class CUDAAllocator : public Allocator { +class CUDAAllocator : public DeviceAllocator { public: virtual void* raw_alloc(size_t nbytes) = 0; virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0; virtual void raw_delete(void* ptr) = 0; virtual void init(int device_count) = 0; - virtual bool initialized() = 0; virtual double getMemoryFraction(c10::DeviceIndex device) = 0; virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0; - virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0; virtual void enable(bool value) = 0; virtual bool isEnabled() const = 0; virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0; virtual void* getBaseAllocation(void* ptr, size_t* size) = 0; - virtual void recordStream(const DataPtr&, CUDAStream stream) = 0; - virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats( - c10::DeviceIndex device) = 0; - virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; - virtual void resetPeakStats(c10::DeviceIndex device) = 0; + // Keep for BC only + virtual void recordStream(const DataPtr& ptr, CUDAStream stream) = 0; + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + CUDAStream cuda_stream = CUDAStream(stream); + recordStream(ptr, cuda_stream); + } virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0; virtual void beginAllocateToPool( c10::DeviceIndex device, @@ -525,6 +525,10 @@ inline void enablePeerAccess( namespace c10::cuda { +// Keep BC only +using c10::CaptureId_t; +using c10::MempoolId_t; + // MemPool represents a pool of memory in a caching allocator. Currently, // it's just the ID of the pool object maintained in the CUDACachingAllocator. // @@ -535,8 +539,7 @@ struct C10_CUDA_API MemPool { MemPool( CUDACachingAllocator::CUDAAllocator* allocator = nullptr, bool is_user_created = true, - bool use_on_oom = false, - bool symmetric = false); + bool use_on_oom = false); MemPool(const MemPool&) = delete; MemPool(MemPool&&) = default; MemPool& operator=(const MemPool&) = delete; @@ -544,7 +547,6 @@ struct C10_CUDA_API MemPool { ~MemPool(); MempoolId_t id(); - bool is_symmetric(); CUDACachingAllocator::CUDAAllocator* allocator(); int use_count(); c10::DeviceIndex device(); @@ -556,7 +558,6 @@ struct C10_CUDA_API MemPool { CUDACachingAllocator::CUDAAllocator* allocator_; bool is_user_created_; MempoolId_t id_; - bool symmetric_; c10::DeviceIndex device_; }; diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 0e8cabf618593..9839e4e72049e 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -53,13 +53,12 @@ int device_count_impl(bool fail_if_no_driver) { "https://pytorch.org to install a PyTorch version that has been " "compiled with your version of the CUDA driver."); } - } break; + } case cudaErrorInitializationError: TORCH_CHECK( false, "CUDA driver initialization failed, you might not " "have a CUDA gpu."); - break; case cudaErrorUnknown: TORCH_CHECK( false, @@ -67,7 +66,6 @@ int device_count_impl(bool fail_if_no_driver) { "incorrectly set up environment, e.g. changing env " "variable CUDA_VISIBLE_DEVICES after program start. 
" "Setting the available devices to be zero."); - break; #if C10_ASAN_ENABLED case cudaErrorMemoryAllocation: // In ASAN mode, we know that a cudaErrorMemoryAllocation error will @@ -80,6 +78,18 @@ int device_count_impl(bool fail_if_no_driver) { "would like to use GPUs, turn off ASAN."); break; #endif // C10_ASAN_ENABLED +#if _WIN32 && CUDA_VERSION >= 13000 + // Workaround for CUDA-13.0 error handling on Windows, see + // https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585 + case cudaErrorNotSupported: + if (!fail_if_no_driver) { + TORCH_WARN( + "cudaGetDeviceCount() returned cudaErrorNotSupported, " + "likely using older driver or on CPU machine"); + count = 0; + break; + } +#endif default: TORCH_CHECK( false, diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h index eb29ca8bc9f02..936875fd71d5c 100644 --- a/c10/cuda/CUDAGraphsC10Utils.h +++ b/c10/cuda/CUDAGraphsC10Utils.h @@ -9,12 +9,6 @@ namespace c10::cuda { -using CaptureId_t = unsigned long long; - -// first is set if the instance is created by CUDAGraph::capture_begin. -// second is set if the instance is created by at::cuda::graph_pool_handle. -using MempoolId_t = std::pair; - // RAII guard for "cudaStreamCaptureMode", a thread-local value // that controls the error-checking strictness of a capture. struct C10_CUDA_API CUDAStreamCaptureModeGuard { diff --git a/c10/cuda/CUDAStream.cpp b/c10/cuda/CUDAStream.cpp index 0cde2d9de01cf..6d2b1e06fda9b 100644 --- a/c10/cuda/CUDAStream.cpp +++ b/c10/cuda/CUDAStream.cpp @@ -147,7 +147,7 @@ static inline StreamIdType streamIdType(StreamId s) { // rightmost bit int mask_for_type = (1 << kStreamTypeBits) - 1; auto val = (s >> 1) & mask_for_type; - TORCH_INTERNAL_ASSERT(val || !(s & 1), "invalid StreamId", s); + TORCH_CHECK(val || !(s & 1), "invalid StreamId", s); return StreamIdType(val); } @@ -216,9 +216,6 @@ static void initSingleStream(int p, DeviceIndex device_index, int i) { // Creates the low and high priority stream pools for the specified device // Warning: only call once per device! static void initDeviceStreamState(DeviceIndex device_index) { - // Switches to the requested device so streams are properly associated - // with it. 
- CUDAGuard device_guard{device_index}; for (const auto i : c10::irange(kStreamsPerPool)) { for (const auto p : c10::irange(max_stream_priorities)) { initSingleStream(p, device_index, i); @@ -279,7 +276,7 @@ cudaStream_t CUDAStream::stream() const { StreamIdType st = streamIdType(stream_id); size_t si = streamIdIndex(stream_id); if (st.isDefault()) { - TORCH_INTERNAL_ASSERT( + TORCH_CHECK( si == 0, "Unrecognized stream ", stream_, @@ -294,7 +291,7 @@ cudaStream_t CUDAStream::stream() const { return reinterpret_cast(stream_id); } else { auto streamType = st.getStreamType(); - TORCH_INTERNAL_ASSERT( + TORCH_CHECK( streamType >= 1 && streamType <= max_stream_priorities, "Unrecognized stream ", stream_, diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp index f4b62e53fcc00..d545bf5477b64 100644 --- a/c10/cuda/driver_api.cpp +++ b/c10/cuda/driver_api.cpp @@ -38,6 +38,13 @@ DriverAPI create_driver_api() { C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY) #undef LOOKUP_NVML_ENTRY } + + if (handle_1) { +#define LOOKUP_NVML_ENTRY_OPTIONAL(name) \ + r.name##_ = ((decltype(&name))dlsym(handle_1, #name)); + C10_NVML_DRIVER_API_OPTIONAL(LOOKUP_NVML_ENTRY_OPTIONAL) +#undef LOOKUP_NVML_ENTRY_OPTIONAL + } return r; } @@ -54,11 +61,14 @@ void* get_symbol(const char* name, int version) { } #endif + // As of CUDA 13, this API is deprecated. +#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000) // This fallback to the old API to try getting the symbol again. if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres); st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) { return out; } +#endif // If the symbol cannot be resolved, report and return nullptr; // the caller is responsible for checking the pointer. diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h index 9800809d1e535..8910e581a1a4e 100644 --- a/c10/cuda/driver_api.h +++ b/c10/cuda/driver_api.h @@ -53,7 +53,8 @@ #define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \ _(cuMulticastAddDevice, 12030) \ _(cuMulticastBindMem, 12030) \ - _(cuMulticastCreate, 12030) + _(cuMulticastCreate, 12030) \ + _(cuMulticastUnbind, 12030) #else #define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) #endif @@ -66,6 +67,12 @@ _(nvmlDeviceGetComputeRunningProcesses) \ _(nvmlSystemGetCudaDriverVersion_v2) +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12040) +#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV) +#else +#define C10_NVML_DRIVER_API_OPTIONAL(_) +#endif + namespace c10::cuda { struct DriverAPI { @@ -74,6 +81,7 @@ struct DriverAPI { C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED) C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED) C10_NVML_DRIVER_API(CREATE_MEMBER) + C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER) #undef CREATE_MEMBER_VERSIONED #undef CREATE_MEMBER diff --git a/c10/metal/atomic.h b/c10/metal/atomic.h index 6dcd9a706ba74..d0cbc03916989 100644 --- a/c10/metal/atomic.h +++ b/c10/metal/atomic.h @@ -124,5 +124,54 @@ struct AtomicType { } }; +// ComplexHalf atomic op +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, half2 value) { + auto ptr = data + offset; + auto old = + ::metal::atomic_load_explicit(ptr, ::metal::memory_order_relaxed); + while (!::metal::atomic_compare_exchange_weak_explicit( + ptr, + &old, + as_type(as_type(old) + value), + ::metal::memory_order_relaxed, + ::metal::memory_order_relaxed)) + ; + } +}; + +// There are no atomic 64-bit add in Metal yet, but templates below implements a +// consistent add I.e. 
if multiple threads are modify the same 64-bit value, +// results stored at the address will eventually be equal to its original value +// plus sum of all operands +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, long value) { + const auto value_bits = as_type(value); + const uint low = static_cast(value_bits); + uint high = static_cast(value_bits >> 32); + auto ptr = data + (offset << 1); + auto old_low = + atomic_fetch_add_explicit(ptr, low, ::metal::memory_order_relaxed); + high += (old_low + low < old_low) ? 1 : 0; + atomic_fetch_add_explicit(ptr + 1, high, ::metal::memory_order_relaxed); + } +}; + +// ComplexFloat atomic op, which again is not really atomic, but eventually +// consistent +template <> +struct AtomicType { + using type = ::metal::atomic; + static inline void atomic_add(device type* data, long offset, float2 value) { + auto ptr = data + (offset << 1); + atomic_fetch_add_explicit(ptr + 0, value.x, ::metal::memory_order_relaxed); + atomic_fetch_add_explicit(ptr + 1, value.y, ::metal::memory_order_relaxed); + } +}; + } // namespace metal } // namespace c10 diff --git a/c10/metal/igamma.h b/c10/metal/igamma.h new file mode 100644 index 0000000000000..8dabdbbb621c9 --- /dev/null +++ b/c10/metal/igamma.h @@ -0,0 +1,744 @@ +#pragma once + +#include +#include +#include + +using namespace c10::metal; +using namespace metal; + +namespace c10 { +namespace metal { + +template +inline float log_gamma(const T); + +inline float expm1f(float a); + +template +float erfc(T x); + +} // namespace metal +} // namespace c10 + +namespace { + +template +inline float lgamma(const T a) { + return log_gamma(a); +} + +inline float expm1(float a) { + return expm1f(a); +} + +// NOTE: The following code was ported directly from the CUDA implementation in +// `aten/src/ATen/native/cuda/IGammaKernel.cu` + +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +// regularized lower & upper incomplete gamma +template +scalar_t ratevl( + scalar_t x, + const scalar_t num[], + int64_t M, + const scalar_t denom[], + int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + using accscalar_t = opmath_t; + int64_t i, dir; + accscalar_t y, num_ans, denom_ans; + accscalar_t absx = ::fabs(x); + thread const accscalar_t* p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. 
*/ + dir = -1; + p = num + M; + y = 1 / x; + } else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return ::pow(x, static_cast(i)) * num_ans / denom_ans; + } else { + return num_ans / denom_ans; + } +} + +template +scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + using accscalar_t = opmath_t; + + const accscalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859}; + const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0}; + return ratevl( + static_cast(x), + lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / + sizeof(lanczos_sum_expg_scaled_num[0]) - + 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / + sizeof(lanczos_sum_expg_scaled_denom[0]) - + 1); +} + +template +scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // exp(a - x). + + using accscalar_t = opmath_t; + accscalar_t ax, fac, res, num, numfac; + const accscalar_t MAXLOG = 88.72283905206835; + const accscalar_t EXP1 = 2.718281828459045; + const accscalar_t lanczos_g = 6.024680040776729583740234375; + + if (::fabs(a - x) > 0.4 * ::fabs(a)) { + ax = a * ::log(x) - x - ::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return ::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = ::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= ::exp(a - x) * ::pow(x / fac, a); + } else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= ::exp(a * (::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + + using accscalar_t = opmath_t; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const int MAXITER = 2000; + + int i; + accscalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. 
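
To make the "extra care" above concrete: the loop below accumulates fac = (-x)^n / n! and sum over n >= 1 of (-x)^n / (n! (a+n)), while the n = 0 term of the underlying power series is folded into the subtraction from 1 through expm1, so the leading cancellation is evaluated accurately. In LaTeX, the quantity returned is

\[
Q(a,x) = 1 - \frac{x^{a}}{\Gamma(a)} \sum_{n \ge 0} \frac{(-x)^{n}}{n!\,(a+n)}
       = -\operatorname{expm1}\bigl(a \ln x - \ln\Gamma(a+1)\bigr)
         - e^{\,a \ln x - \ln\Gamma(a)} \sum_{n \ge 1} \frac{(-x)^{n}}{n!\,(a+n)},
\]

which is exactly the final `term - ::exp(a * logx - ::lgamma(a)) * sum` computed at the end of this function.
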
+ + using accscalar_t = opmath_t; + int n; + accscalar_t fac = 1; + accscalar_t sum = 0; + accscalar_t term, logx; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (::fabs(term) <= MACHEP * ::fabs(sum)) { + break; + } + } + + logx = ::log(x); + term = -::expm1(a * logx - ::lgamma(1 + a)); + return term - ::exp(a * logx - ::lgamma(a)) * sum; +} + +template +scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + + using accscalar_t = opmath_t; + const accscalar_t d[25][25] = { + {-3.3333333333333333e-1, 8.3333333333333333e-2, + -1.4814814814814815e-2, 1.1574074074074074e-3, + 3.527336860670194e-4, -1.7875514403292181e-4, + 3.9192631785224378e-5, -2.1854485106799922e-6, + -1.85406221071516e-6, 8.296711340953086e-7, + -1.7665952736826079e-7, 6.7078535434014986e-9, + 1.0261809784240308e-8, -4.3820360184533532e-9, + 9.1476995822367902e-10, -2.551419399494625e-11, + -5.8307721325504251e-11, 2.4361948020667416e-11, + -5.0276692801141756e-12, 1.1004392031956135e-13, + 3.3717632624009854e-13, -1.3923887224181621e-13, + 2.8534893807047443e-14, -5.1391118342425726e-16, + -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, + -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, + -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, + 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, + 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, + 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, + 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, + -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, + -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, + 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, + -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, + -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, + -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, + 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, + 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, + 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, + 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, + 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, + 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, + -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, + -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, + -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, + -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, + 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, + 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, + -2.9907248030319018e-4, -1.4638452578843418e-6, + 6.6414982154651222e-5, -3.9683650471794347e-5, + 1.1375726970678419e-5, 2.5074972262375328e-10, + -1.6954149536558306e-6, 8.9075075322053097e-7, + -2.2929348340008049e-7, 2.956794137544049e-11, + 2.8865829742708784e-8, -1.4189739437803219e-8, + 3.4463580499464897e-9, -2.3024517174528067e-13, + 
-3.9409233028046405e-10, 1.8602338968504502e-10, + -4.356323005056618e-11, 1.2786001016296231e-15, + 4.6792750266579195e-12, -2.1492464706134829e-12, + 4.9088156148096522e-13, -6.3385914848915603e-18, + -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, + -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, + -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, + -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, + 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, + 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, + 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, + -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, + -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, + 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, + -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, + -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, + -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, + 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, + 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, + 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, + 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, + -3.3493161081142236e-4, 2.812695154763237e-4, + -1.0976582244684731e-4, -1.2741009095484485e-7, + 2.7744451511563644e-5, -1.8263488805711333e-5, + 5.7876949497350524e-6, 4.9387589339362704e-10, + -1.0595367014026043e-6, 6.1667143761104075e-7, + -1.7562973359060462e-7, -1.2974473287015439e-12, + 2.695423606288966e-8, -1.4578352908731271e-8, + 3.887645959386175e-9, -3.8810022510194121e-17, + -5.3279941738772867e-10, 2.7437977643314845e-10, + -6.9957960920705679e-11, 2.5899863874868481e-17, + 8.8566890996696381e-12, -4.403168815871311e-12, + 1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, + -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, + 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, + 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, + 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, + 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, + -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, + -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, + -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, + 6.7823088376673284e-4, -6.4014752602627585e-4, + 2.7750107634328704e-4, 1.8197008380465151e-7, + -8.4795071170685032e-5, 6.105192082501531e-5, + -2.1073920183404862e-5, -8.8585890141255994e-10, + 4.5284535953805377e-6, -2.8427815022504408e-6, + 8.7082341778646412e-7, 3.6886101871706965e-12, + -1.5344695190702061e-7, 8.862466778790695e-8, + -2.5184812301826817e-8, -1.0225912098215092e-14, + 3.8969470758154777e-9, -2.1267304792235635e-9, + 5.7370135528051385e-10, -1.887749850169741e-19, + -8.0931538694657866e-11, 4.2382723283449199e-11, + -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, + 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, + -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, + 
-3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, + -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, + -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, + 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, + 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, + 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, + 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, + 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, + 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, + -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, + -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, + -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, + -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, + 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, + -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, + 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, + 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, + 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, + 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, + -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, + -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, + -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, + -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, + -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, + -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, + 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, + 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, + 3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8, + 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, + -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, + 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, + -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, + -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, + -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, + -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, + 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, + 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, + 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, + 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, + 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, + 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, + -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, + -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, + -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, + -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, + 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, + -3.1478872752284357e-5, 
9.0510635276848131e-2, -9.2828824411184397e-2, + 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, + 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, + 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, + 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, + -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, + -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, + -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, + -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, + -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, + -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, + 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, + 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, + 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, + 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, + -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, + 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, + -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, + -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, + -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, + -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, + 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, + 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, + 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, + 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, + 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, + 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, + -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, + -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11, + -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, + 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, + 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, + -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, + 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, + 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, + 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, + 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, + -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, + -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, + -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, + -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, + -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, + -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, + 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, + 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, + 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, + -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, + -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 
1.322177542759164e+2, + 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, + -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, + -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, + -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, + -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, + 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, + 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, + 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, + 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, + 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, + 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, + -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, + -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, + -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, + 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, + 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, + -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, + 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, + 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, + 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, + -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, + -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, + -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, + 4.8683443692930507e-1}}; + + int k, n, sgn; + int maxpow = 0; + const accscalar_t MACHEP = 5.9604644775390625E-8; + accscalar_t lambda = x / a; + accscalar_t sigma = (x - a) / a; + accscalar_t eta, res, ck, ckterm, term, absterm; + accscalar_t absoldterm = INFINITY; + accscalar_t etapow[25] = {1}; + accscalar_t sum = 0; + accscalar_t afac = 1; + + if (igam) { + sgn = -1; + } else { + sgn = 1; + } + + if (lambda > 1) { + eta = ::sqrt(-2 * (::log1p(sigma) - sigma)); + } else if (lambda < 1) { + eta = -::sqrt(-2 * (::log1p(sigma) - sigma)); + } else { + eta = 0; + } + res = 0.5 * ::erfc(sgn * eta * ::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n - 1]; + maxpow += 1; + } + ckterm = d[k][n] * etapow[n]; + ck += ckterm; + if (::fabs(ckterm) < MACHEP * ::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = ::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * ::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * ::exp(-0.5 * a * eta * eta) * sum / ::sqrt(2 * 3.1415926535 * a); + + return res; +} + +template +scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. 
[igam1] + + using accscalar_t = opmath_t; + int i; + accscalar_t ans, ax, c, yc, r, t, y, z; + accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const accscalar_t BIG = 16777216.; + const accscalar_t BIGINV = 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = ::fabs((ans - r) / r); + ans = r; + } else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the subtraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 4.5; + + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 0.0; + } else { + return NAN; + } + } else if (x == 0) { + return 1.0; + } else if (isinf(a)) { + if (isinf(x)) { + return NAN; + } + return 1.0; + } else if (isinf(x)) { + return 0.0; + } + + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_continued_fraction(a, x); + } + } else if (x <= 0.5) { + if (-0.4 / ::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); + } + } else { + if (x * 1.1 < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); + } + } +} + +template +scalar_t calc_igamma(scalar_t a, scalar_t x) { + /* the calculation of the regularized lower incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.3 [igam1]) + * - if x > 1 and x > a, using the subtraction from the regularized upper + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (4) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 
4.5; + + // boundary values following SciPy + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 1.0; + } else { + return NAN; + } + } else if (x == 0) { + return 0.0; // zero integration limit + } else if (isinf(a)) { + if (isinf(x)) { + return NAN; + } + return 0.0; + } else if (isinf(x)) { + return 1.0; + } + + /* Asymptotic regime where a ~ x. */ + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 1); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 1); + } + + if ((x > 1.0) && (x > a)) { + return 1.0 - calc_igammac(a, x); + } + + return _igam_helper_series(a, x); +} + +} // namespace + +// end of regularized lower & upper incomplete gamma + +namespace c10 { +namespace metal { + +template +inline T igamma(T a, T b) { + return calc_igamma(a, b); +} + +template +inline T igammac(T a, T b) { + return calc_igammac(a, b); +} + +} // namespace metal +} // namespace c10 diff --git a/c10/metal/special_math.h b/c10/metal/special_math.h index 34f6ab6d1d09e..29a45ff4c30b6 100644 --- a/c10/metal/special_math.h +++ b/c10/metal/special_math.h @@ -1,6 +1,7 @@ // Implementation of specal math functions for Metal #pragma once #include +#include #include #include @@ -47,6 +48,11 @@ inline float erf(T x) { return r; } +template +float erfc(T x) { + return 1.0 - erf(x); +} + template inline float erfinv(T y) { /* coefficients in rational expansion */ diff --git a/c10/test/build.bzl b/c10/test/build.bzl index 2f54c8a2faa5b..deb917dd8fcf3 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -46,7 +46,7 @@ def define_targets(rules): "util/typeid_test.cpp", ], ), - copts = ["-Wno-deprecated-declarations"], + copts = ["-Wno-deprecated-declarations", "-Wno-ctad-maybe-unsupported"], deps = [ ":Macros", ":complex_math_test_common", diff --git a/c10/test/core/SymInt_test.cpp b/c10/test/core/SymInt_test.cpp index 7cefa1e4a771b..e408543f5362c 100644 --- a/c10/test/core/SymInt_test.cpp +++ b/c10/test/core/SymInt_test.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -35,4 +36,169 @@ TEST(SymIntTest, Overflows) { } #endif +namespace { + +// We need a SymNodeImpl that 1) has working arithmetic with +// predictable results and 2) causes SymInt::maybe_as_int to return +// nullopt so that we can hit all 4 cases (zero/one/both arguments +// have null maybe_as_int) in the operator implementations. +class ConstantIntPretendingToBeSymbolicSymNodeImpl + : public ConstantSymNodeImpl { + public: + using ConstantSymNodeImpl::ConstantSymNodeImpl; + std::optional constant_int() override { + return std::nullopt; + } + std::optional maybe_as_int() override { + return std::nullopt; + } + // Needs to be implemented for arithmetic to actually + // work. NestedIntSymNodeImpl does this, for example. 
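
The wrap_int/wrap_bool overrides that follow are what this comment refers to: when a SymInt binary operator sees one plain integer operand and one symbolic operand, the plain value is promoted by calling wrap_int on the symbolic node and the arithmetic is then dispatched on the wrapped node, so the wrapper has to supply working add/sub/etc. A sketch of the kind of test this enables; the test name and the exact promotion path are assumptions, and it relies only on the create_symbolic_symint/unwrap helpers defined later in this file plus the gtest macros the file already uses:

TEST(SymIntTest, ArithmeticCoversAllMaybeAsIntCases) {
  SymInt sym_a = create_symbolic_symint(3); // maybe_as_int() == nullopt
  SymInt sym_b = create_symbolic_symint(4);
  SymInt plain_a(3); // maybe_as_int() has a value
  SymInt plain_b(4);

  EXPECT_EQ(unwrap(sym_a + sym_b), 7); // both operands symbolic
  EXPECT_EQ(unwrap(sym_a + plain_b), 7); // rhs promoted via wrap_int
  EXPECT_EQ(unwrap(plain_a + sym_b), 7); // lhs promoted via wrap_int
  EXPECT_EQ(unwrap(plain_a + plain_b), 7); // both plain, constant fast path
}
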
+  c10::SymNode wrap_int(int64_t num) override {
+    return SymNode(
+        c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(num));
+  }
+
+  c10::SymNode wrap_bool(bool b) override {
+    return SymNode(c10::make_intrusive<ConstantSymNodeImpl<bool>>(b));
+  }
+
+  SymNode add(const SymNode& other) override {
+    return wrap_int(int_() + other->int_());
+  }
+
+  SymNode sub(const SymNode& other) override {
+    return wrap_int(int_() - other->int_());
+  }
+
+  SymNode mul(const SymNode& other) override {
+    return wrap_int(int_() * other->int_());
+  }
+
+  SymNode floordiv(const SymNode& other) override {
+    return wrap_int(int_() / other->int_());
+  }
+
+  SymNode sym_min(const SymNode& other) override {
+    return wrap_int(std::min(int_(), other->int_()));
+  }
+
+  SymNode sym_max(const SymNode& other) override {
+    return wrap_int(std::max(int_(), other->int_()));
+  }
+
+  SymNode mod(const SymNode& other) override {
+    return wrap_int(int_() % other->int_());
+  }
+
+  SymNode eq(const SymNode& other) override {
+    return wrap_bool(int_() == other->int_());
+  }
+
+  SymNode ne(const SymNode& other) override {
+    return wrap_bool(int_() != other->int_());
+  }
+
+  SymNode lt(const SymNode& other) override {
+    return wrap_bool(int_() < other->int_());
+  }
+
+  SymNode le(const SymNode& other) override {
+    return wrap_bool(int_() <= other->int_());
+  }
+
+  SymNode gt(const SymNode& other) override {
+    return wrap_bool(int_() > other->int_());
+  }
+
+  SymNode ge(const SymNode& other) override {
+    return wrap_bool(int_() >= other->int_());
+  }
+};
+
+SymInt create_symbolic_symint(int64_t value) {
+  return SymInt(
+      SymNode(c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(
+          value)));
+}
+
+auto unwrap(const SymInt& x) {
+  return x.guard_int(__FILE__, __LINE__);
+}
+
+auto unwrap(bool b) {
+  return b;
+}
+
+template